From 28b0982ea70c21841fb23802d38f6b424f8200e1 Mon Sep 17 00:00:00 2001
From: Devin Matthews
Date: Wed, 10 Nov 2021 12:34:50 -0600
Subject: [PATCH 001/230] Refactored her[2]k/syr[2]k in terms of gemmt. (#531)

Details:
- Renamed herk macrokernels and supporting files and functions to gemmt,
  which is possible since at the macrokernel level they are identical.
  Then recast herk/her2k/syrk/syr2k in terms of gemmt within the expert
  level-3 oapi (bli_l3_oapi_ex.c) while also redefining them as literal
  functions rather than cpp macros that instantiate multiple functions.
  Thanks to Devin Matthews for his efforts on this issue (#531).
- Check that the maximum stack buffer size is sufficiently large relative
  to the register blocksizes for each datatype, and do so when the
  context is initialized rather than when an operation is called. Note
  that with this change, users who pass in their own contexts into the
  expert interfaces currently will *not* have any checks performed.
  Thanks to Devin Matthews for suggesting this change.
--- config/zen/bli_family_zen.h | 4 +- config/zen2/bli_family_zen2.h | 4 +- frame/3/bli_l3.h | 4 - frame/3/bli_l3_blocksize.c | 12 +- frame/3/bli_l3_blocksize.h | 6 +- frame/3/bli_l3_check.c | 5 - frame/3/bli_l3_cntl.c | 4 +- frame/3/bli_l3_direct.c | 6 +- frame/3/bli_l3_direct.h | 2 +- frame/3/bli_l3_ind.c | 7 +- frame/3/bli_l3_ind.h | 4 - frame/3/bli_l3_oapi_ex.c | 773 +++++++++++------- frame/3/bli_l3_prune.c | 6 +- frame/3/bli_l3_prune.h | 6 +- frame/3/bli_l3_thrinfo.h | 8 +- frame/3/gemm/bli_gemm_blk_var3.c | 2 +- frame/3/gemm/bli_gemm_cntl.c | 6 +- frame/3/gemmt/bli_gemmt.h | 2 + frame/3/gemmt/bli_gemmt_front.c | 2 +- .../bli_gemmt_l_ker_var2.c} | 16 +- .../bli_gemmt_u_ker_var2.c} | 16 +- .../bli_herk_var.h => gemmt/bli_gemmt_var.h} | 16 +- .../bli_gemmt_x_ker_var2.c} | 4 +- .../other/bli_gemmt_l_ker_var2.c} | 12 +- .../other/bli_gemmt_u_ker_var2.c} | 12 +- frame/3/her2k/bli_her2k.h | 36 - frame/3/her2k/bli_her2k_front.c | 161 ---- frame/3/her2k/bli_her2k_front.h | 45 - frame/3/herk/bli_herk.h | 38 - frame/3/herk/bli_herk_front.c | 124 --- frame/3/herk/bli_herk_front.h | 44 - .../herk/other/bli_herk_l_ker_var2.1looprr.c | 420 ---------- frame/3/herk/other/bli_herk_l_ker_var2rr.c | 555 ------------- frame/3/herk/other/bli_herk_l_ker_var2sl.c | 556 ------------- .../herk/other/bli_herk_u_ker_var2.1looprr.c | 420 ---------- frame/3/herk/other/bli_herk_u_ker_var2rr.c | 557 ------------- frame/3/herk/other/bli_herk_u_ker_var2sl.c | 558 ------------- frame/3/syr2k/bli_syr2k.h | 36 - frame/3/syr2k/bli_syr2k_front.c | 134 --- frame/3/syr2k/bli_syr2k_front.h | 45 - frame/3/syrk/bli_syrk.h | 36 - frame/3/syrk/bli_syrk_front.c | 119 --- frame/3/syrk/bli_syrk_front.h | 58 -- frame/base/bli_check.c | 30 +- frame/base/bli_check.h | 2 +- frame/base/bli_gks.c | 5 + frame/base/bli_info.c | 9 +- frame/base/bli_info.h | 1 + frame/base/bli_part.c | 18 +- frame/thread/bli_thread.c | 4 +- .../3/{bli_syrk_small.c => bli_gemmt_small.c} | 88 +- sandbox/gemmlike/bls_gemm_check.c | 5 - 
52 files changed, 647 insertions(+), 4396 deletions(-) rename frame/3/{herk/bli_herk_l_ker_var2.c => gemmt/bli_gemmt_l_ker_var2.c} (97%) rename frame/3/{herk/bli_herk_u_ker_var2.c => gemmt/bli_gemmt_u_ker_var2.c} (97%) rename frame/3/{herk/bli_herk_var.h => gemmt/bli_gemmt_var.h} (90%) rename frame/3/{herk/bli_herk_x_ker_var2.c => gemmt/bli_gemmt_x_ker_var2.c} (97%) rename frame/3/{herk/other/bli_herk_l_ker_var2.c => gemmt/other/bli_gemmt_l_ker_var2.c} (97%) rename frame/3/{herk/other/bli_herk_u_ker_var2.c => gemmt/other/bli_gemmt_u_ker_var2.c} (97%) delete mode 100644 frame/3/her2k/bli_her2k.h delete mode 100644 frame/3/her2k/bli_her2k_front.c delete mode 100644 frame/3/her2k/bli_her2k_front.h delete mode 100644 frame/3/herk/bli_herk.h delete mode 100644 frame/3/herk/bli_herk_front.c delete mode 100644 frame/3/herk/bli_herk_front.h delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2rr.c delete mode 100644 frame/3/herk/other/bli_herk_l_ker_var2sl.c delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2rr.c delete mode 100644 frame/3/herk/other/bli_herk_u_ker_var2sl.c delete mode 100644 frame/3/syr2k/bli_syr2k.h delete mode 100644 frame/3/syr2k/bli_syr2k_front.c delete mode 100644 frame/3/syr2k/bli_syr2k_front.h delete mode 100644 frame/3/syrk/bli_syrk.h delete mode 100644 frame/3/syrk/bli_syrk_front.c delete mode 100644 frame/3/syrk/bli_syrk_front.h rename kernels/zen/3/{bli_syrk_small.c => bli_gemmt_small.c} (99%) diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h index c82392b60..d1c4ef828 100644 --- a/config/zen/bli_family_zen.h +++ b/config/zen/bli_family_zen.h @@ -52,8 +52,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define 
BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 diff --git a/config/zen2/bli_family_zen2.h b/config/zen2/bli_family_zen2.h index a0f5b574d..d7adddf3c 100644 --- a/config/zen2/bli_family_zen2.h +++ b/config/zen2/bli_family_zen2.h @@ -51,8 +51,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 94e37fc17..da9348844 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -84,11 +84,7 @@ // Operation-specific headers. #include "bli_gemm.h" #include "bli_hemm.h" -#include "bli_herk.h" -#include "bli_her2k.h" #include "bli_symm.h" -#include "bli_syrk.h" -#include "bli_syr2k.h" #include "bli_trmm.h" #include "bli_trmm3.h" #include "bli_trsm.h" diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 58b658d1d..1986b3b0f 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -51,8 +51,8 @@ dim_t bli_l3_determine_kc if ( family == BLIS_GEMM ) return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); - else if ( family == BLIS_HERK ) - return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_GEMMT ) + return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx ); else if ( family == BLIS_TRMM ) return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); else if ( family == BLIS_TRSM ) @@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \ } GENFRONT( gemm_determine_kc, gemm ) -GENFRONT( herk_determine_kc, herk ) +GENFRONT( gemmt_determine_kc, 
gemmt ) GENFRONT( trmm_determine_kc, trmm ) GENFRONT( trsm_determine_kc, trsm ) @@ -201,7 +201,7 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_blksz_get_def( dt, bsize ); \ b_max = bli_blksz_get_max( dt, bsize ); \ \ - /* Notice that for herk, we do not need to perform any special handling + /* Notice that for gemmt, we do not need to perform any special handling for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined @@ -211,8 +211,8 @@ dim_t PASTEMAC0(opname) \ return b_use; \ } -GENFRONT( herk_determine_kc_f, f ) -GENFRONT( herk_determine_kc_b, b ) +GENFRONT( gemmt_determine_kc_f, f ) +GENFRONT( gemmt_determine_kc_b, b ) // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index c3301ee13..3ea3c5aa0 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -60,7 +60,7 @@ dim_t PASTEMAC0(opname) \ ); GENPROT( gemm_determine_kc ) -GENPROT( herk_determine_kc ) +GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) @@ -81,8 +81,8 @@ dim_t PASTEMAC0(opname) \ GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) -GENPROT( herk_determine_kc_f ) -GENPROT( herk_determine_kc_b ) +GENPROT( gemmt_determine_kc_f ) +GENPROT( gemmt_determine_kc_b ) GENPROT( trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 413f6a58d..50da4627c 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -597,10 +597,5 @@ void bli_l3_basic_check e_val = bli_check_object_buffer( c ); bli_check_error_code( e_val ); - - // Check for sufficiently sized stack buffers - - e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx ); - bli_check_error_code( e_val ); } diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index f6bfbedbb..3cdecfbc2 100644 --- a/frame/3/bli_l3_cntl.c 
+++ b/frame/3/bli_l3_cntl.c @@ -54,7 +54,7 @@ void bli_l3_cntl_create_if if ( cntl_orig == NULL ) { if ( family == BLIS_GEMM || - family == BLIS_HERK || + family == BLIS_GEMMT || family == BLIS_TRMM ) { *cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b ); @@ -97,7 +97,7 @@ void bli_l3_cntl_free opid_t family = bli_cntl_family( cntl_use ); if ( family == BLIS_GEMM || - family == BLIS_HERK || + family == BLIS_GEMMT || family == BLIS_TRMM ) { bli_gemm_cntl_free( rntm, cntl_use, thread ); diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 7baf2d6ef..0d0a71921 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -46,7 +46,7 @@ dir_t bli_l3_direct opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); - else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); + else if ( family == BLIS_GEMMT ) return bli_gemmt_direct( a, b, c ); else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c ); else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c ); @@ -68,14 +68,14 @@ dir_t bli_gemm_direct return BLIS_FWD; } -dir_t bli_herk_direct +dir_t bli_gemmt_direct ( obj_t* a, obj_t* b, obj_t* c ) { - // For herk, movement may be forwards (or backwards). + // For gemmt, movement may be forwards (or backwards). 
return BLIS_FWD; } diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 7383c4a9f..39798407a 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -53,7 +53,7 @@ dir_t PASTEMAC0(opname) \ ); GENPROT( gemm_direct ) -GENPROT( herk_direct ) +GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) diff --git a/frame/3/bli_l3_ind.c b/frame/3/bli_l3_ind.c index 7c30f61af..fbf73be60 100644 --- a/frame/3/bli_l3_ind.c +++ b/frame/3/bli_l3_ind.c @@ -55,7 +55,8 @@ static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = static BLIS_THREAD_LOCAL bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = { - /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ + /* gemm gemmt hemm herk her2k symm + syrk syr2k trmm3 trmm trsm */ /* c z */ /* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, @@ -80,11 +81,7 @@ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ) \ GENFUNC( gemm, BLIS_GEMM ) GENFUNC( gemmt, BLIS_GEMMT ) GENFUNC( hemm, BLIS_HEMM ) -GENFUNC( herk, BLIS_HERK ) -GENFUNC( her2k, BLIS_HER2K ) GENFUNC( symm, BLIS_SYMM ) -GENFUNC( syrk, BLIS_SYRK ) -GENFUNC( syr2k, BLIS_SYR2K ) GENFUNC( trmm3, BLIS_TRMM3 ) GENFUNC( trmm, BLIS_TRMM ) GENFUNC( trsm, BLIS_TRSM ) diff --git a/frame/3/bli_l3_ind.h b/frame/3/bli_l3_ind.h index f80757eb0..a14ad783c 100644 --- a/frame/3/bli_l3_ind.h +++ b/frame/3/bli_l3_ind.h @@ -47,11 +47,7 @@ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) -GENPROT( herk ) -GENPROT( her2k ) GENPROT( symm ) -GENPROT( syrk ) -GENPROT( syr2k ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index f6cfd6640..cd0df7017 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -38,301 +38,508 @@ // Define object-based 
interfaces (expert). // -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* If the rntm is non-NULL, it may indicate that we should forgo sup - handling altogether. */ \ - bool enable_sup = TRUE; \ - if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \ -\ - if ( enable_sup ) \ - { \ - /* Execute the small/unpacked oapi handler. If it finds that the problem - does not fall within the thresholds that define "small", or for some - other reason decides not to use the small/unpacked implementation, - the function returns with BLIS_FAILURE, which causes execution to - proceed towards the conventional implementation. */ \ - err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \ - if ( result == BLIS_SUCCESS ) \ - { \ - return; \ - } \ - } \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If each matrix operand has a complex storage datatype, try to get an - induced method (if one is available and enabled). NOTE: Allowing - precisions to vary while using 1m, which is what we do here, is unique - to gemm; other level-3 operations use 1m only if all storage datatypes - are equal (and they ignore the computation precision). */ \ - if ( bli_obj_is_complex( c ) && \ - bli_obj_is_complex( a ) && \ - bli_obj_is_complex( b ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. 
(If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \ -} - // If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be // defined in the sandbox environment. #ifndef BLIS_ENABLE_SANDBOX -GENFRONT( gemm ) + +void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // If the rntm is non-NULL, it may indicate that we should forgo sup + // handling altogether. + bool enable_sup = TRUE; + if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); + + if ( enable_sup ) + { + // Execute the small/unpacked oapi handler. If it finds that the problem + // does not fall within the thresholds that define "small", or for some + // other reason decides not to use the small/unpacked implementation, + // the function returns with BLIS_FAILURE, which causes execution to + // proceed towards the conventional implementation. + err_t result = bli_gemmsup( alpha, a, b, beta, c, cntx, rntm ); + if ( result == BLIS_SUCCESS ) + { + return; + } + } + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. 
+ num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If each matrix operand has a complex storage datatype, try to get an + // induced method (if one is available and enabled). NOTE: Allowing + // precisions to vary while using 1m, which is what we do here, is unique + // to gemm; other level-3 operations use 1m only if all storage datatypes + // are equal (and they ignore the computation precision). + if ( bli_obj_is_complex( c ) && + bli_obj_is_complex( a ) && + bli_obj_is_complex( b ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_gemmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL ); +} + #endif -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. 
*/ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_dt( b ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \ +void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). 
+ if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_gemmtind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemmt_check( alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL ); } -GENFRONT( gemmt ) -GENFRONT( her2k ) -GENFRONT( syr2k ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_dt( b ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. 
(If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( side, alpha, a, b, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, beta, c, cntx, rntm, NULL ); \ + +void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t ah; + obj_t bh; + obj_t alphah; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_her2k_check( alpha, a, b, beta, c, cntx ); + + bli_obj_alias_to( alpha, &alphah ); + bli_obj_toggle_conj( &alphah ); + + bli_obj_alias_to( a, &ah ); + bli_obj_toggle_trans( &ah ); + bli_obj_toggle_conj( &ah ); + + bli_obj_alias_to( b, &bh ); + bli_obj_toggle_trans( &bh ); + bli_obj_toggle_conj( &bh ); + + // Invoke gemmt twice, using beta only the first time. + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bh, beta, c, cntx, rntm ); + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm ); + + // The Hermitian rank-2k product was computed as alpha*A*B'+alpha'*B*A', even for + // the diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-2k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. 
+ bli_setid( &BLIS_ZERO, c ); } -GENFRONT( hemm ) -GENFRONT( symm ) -GENFRONT( trmm3 ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( c ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ - bli_obj_is_complex( c ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, beta, c, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, beta, c, cntx, rntm, NULL ); \ + +void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t at; + obj_t bt; + + // Check parameters. 
+ if ( bli_error_checking_is_enabled() ) + bli_syr2k_check( alpha, a, b, beta, c, cntx ); + + bli_obj_alias_to( b, &bt ); + bli_obj_toggle_trans( &bt ); + + bli_obj_alias_to( a, &at ); + bli_obj_toggle_trans( &at ); + + // Invoke gemmt twice, using beta only the first time. + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt, beta, c, cntx, rntm ); + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, b, &at, &BLIS_ONE, c, cntx, rntm ); } -GENFRONT( herk ) -GENFRONT( syrk ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ - ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - cntx_t* cntx, \ - rntm_t* rntm \ - ) \ -{ \ - bli_init_once(); \ -\ - /* Initialize a local runtime with global settings if necessary. Note - that in the case that a runtime is passed in, we make a local copy. */ \ - rntm_t rntm_l; \ - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \ - else { rntm_l = *rntm; rntm = &rntm_l; } \ -\ - /* Default to using native execution. */ \ - num_t dt = bli_obj_dt( b ); \ - ind_t im = BLIS_NAT; \ -\ - /* If all matrix operands are complex and of the same storage datatype, try - to get an induced method (if one is available and enabled). */ \ - if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \ - bli_obj_is_complex( b ) ) \ - { \ - /* Find the highest priority induced method that is both enabled and - available for the current operation. (If an induced method is - available but not enabled, or simply unavailable, BLIS_NAT will - be returned here.) */ \ - im = PASTEMAC(opname,ind_find_avail)( dt ); \ - } \ -\ - /* If necessary, obtain a valid context from the gks using the induced - method id determined above. */ \ - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); \ -\ - /* Check the operands. */ \ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( side, alpha, a, b, cntx ); \ -\ - /* Invoke the operation's front-end and request the default control tree. 
*/ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx, rntm, NULL ); \ + +void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_hemmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_hemm_check( side, alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); } -GENFRONT( trmm ) -GENFRONT( trsm ) +void PASTEMAC(symm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. 
+ rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_symmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_symm_check( side, alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); +} + + +void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( c ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). 
+ if ( bli_obj_dt( a ) == bli_obj_dt( c ) && + bli_obj_dt( b ) == bli_obj_dt( c ) && + bli_obj_is_complex( c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_trmm3ind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_trmm3_check( side, alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); +} + + +void PASTEMAC(herk,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t ah; + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_herk_check( alpha, a, beta, c, cntx ); + + bli_obj_alias_to( a, &ah ); + bli_obj_toggle_trans( &ah ); + bli_obj_toggle_conj( &ah ); + + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &ah, beta, c, cntx, rntm ); + + // The Hermitian rank-k product was computed as Re(alpha)*A*A', even for the + // diagonal elements. Mathematically, the imaginary components of + // diagonal elements of a Hermitian rank-k product should always be + // zero. However, in practice, they sometimes accumulate meaningless + // non-zero values. To prevent this, we explicitly set those values + // to zero before returning. + bli_setid( &BLIS_ZERO, c ); +} + + +void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + obj_t at; + + // Check parameters. 
+ if ( bli_error_checking_is_enabled() ) + bli_syrk_check( alpha, a, beta, c, cntx ); + + bli_obj_alias_to( a, &at ); + bli_obj_toggle_trans( &at ); + + PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &at, beta, c, cntx, rntm ); +} + + +void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( b ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_is_complex( b ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_trmmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_trmm_check( side, alpha, a, b, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL ); +} + + +void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. 
+ rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Default to using native execution. + num_t dt = bli_obj_dt( b ); + ind_t im = BLIS_NAT; + + // If all matrix operands are complex and of the same storage datatype, try + // to get an induced method (if one is available and enabled). + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && + bli_obj_is_complex( b ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) + im = bli_trsmind_find_avail( dt ); + } + + // If necessary, obtain a valid context from the gks using the induced + // method id determined above. + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_trsm_check( side, alpha, a, b, cntx ); + + // Invoke the operation's front-end and request the default control tree. + bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL ); +} diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index fa008fd15..6ca8244cb 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -47,7 +47,7 @@ void bli_l3_prune_unref_mparts_m opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. - else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c ); else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); } @@ -68,7 +68,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ opid_t family = bli_cntl_family( cntl ); \ \ if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. 
*/ \ - else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \ else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ } @@ -152,7 +152,7 @@ void PASTEMAC(opname,_prune_unref_mparts_k) \ for the k dimension. */ \ } -GENFRONT( herk ) +GENFRONT( gemmt ) // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index 340ecd4db..ad8f07dc4 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -64,9 +64,9 @@ GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) -GENPROT( herk, m ) -GENPROT( herk, n ) -GENPROT( herk, k ) +GENPROT( gemmt, m ) +GENPROT( gemmt, n ) +GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 4726e1042..37a3909fd 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -44,12 +44,12 @@ #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) -// herk +// gemmt -// NOTE: The definition of bli_herk_get_next_?_upanel() does not need to +// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
-#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) -#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) +#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) +#define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 94f0af409..7883dfd6d 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -93,7 +93,7 @@ void bli_gemm_blk_var3 // can simply overwrite the internal beta scalar with BLIS_ONE once // it has been used in the first iteration. However... - // Unlike variant 3 of gemm and herk, which reset the internal scalar + // Unlike variant 3 of gemm and gemmt, which reset the internal scalar // on C at the end of the first iteration so that subsequent iterations // do not erroneously apply beta more than once, it is important that // this behavior not be applied to trmm. That is because the order of diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index d7cd0a92c..27678e0bf 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -63,7 +63,7 @@ cntl_t* bli_gemmbp_cntl_create // Use the function pointers to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; - else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2; + else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; @@ -167,8 +167,8 @@ cntl_t* bli_gemmpb_cntl_create { void_fp macro_kernel_p = bli_gemm_ker_var1; - // Change the macro-kernel if the operation family is herk or trmm. 
- //if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + // Change the macro-kernel if the operation family is gemmt or trmm. + //if ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2; //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. diff --git a/frame/3/gemmt/bli_gemmt.h b/frame/3/gemmt/bli_gemmt.h index ed522ee13..32ab3865e 100644 --- a/frame/3/gemmt/bli_gemmt.h +++ b/frame/3/gemmt/bli_gemmt.h @@ -34,3 +34,5 @@ #include "bli_gemmt_front.h" +#include "bli_gemmt_var.h" + diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 84385bf17..9f18a717d 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -108,7 +108,7 @@ void bli_gemmt_front bli_l3_thread_decorator ( bli_gemm_int, - BLIS_HERK, // operation family id (gemmt uses 'herk' family) + BLIS_GEMMT, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_l_ker_var2.c rename to frame/3/gemmt/bli_gemmt_l_ker_var2.c index 5a05672d7..a995e6c52 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); -void bli_herk_l_ker_var2 +void bli_gemmt_l_ker_var2 ( obj_t* a, obj_t* b, @@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -464,11 +464,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -551,5 +551,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_u_ker_var2.c rename to frame/3/gemmt/bli_gemmt_u_ker_var2.c index 9e685a944..3115fc67b 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); -void bli_herk_u_ker_var2 +void bli_gemmt_u_ker_var2 ( obj_t* a, obj_t* b, @@ -359,11 +359,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -490,11 +490,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ @@ -554,5 +554,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/gemmt/bli_gemmt_var.h similarity index 90% rename from frame/3/herk/bli_herk_var.h rename to frame/3/gemmt/bli_gemmt_var.h index 00b85fc5c..60c68c9f5 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -52,16 +52,10 @@ void PASTEMAC0(opname) \ thrinfo_t* thread \ ); -//GENPROT( herk_blk_var1 ) -//GENPROT( herk_blk_var2 ) -//GENPROT( herk_blk_var3 ) +GENPROT( gemmt_x_ker_var2 ) -GENPROT( herk_x_ker_var2 ) - -GENPROT( herk_l_ker_var2 ) -GENPROT( herk_u_ker_var2 ) -//GENPROT( herk_packa ) -//GENPROT( herk_packb ) +GENPROT( gemmt_l_ker_var2 ) +GENPROT( gemmt_u_ker_var2 ) // @@ -91,6 +85,6 @@ void PASTEMAC(ch,varname) \ thrinfo_t* thread \ ); -INSERT_GENTPROT_BASIC0( herk_l_ker_var2 ) -INSERT_GENTPROT_BASIC0( herk_u_ker_var2 ) +INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 ) +INSERT_GENTPROT_BASIC0( 
gemmt_u_ker_var2 ) diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c similarity index 97% rename from frame/3/herk/bli_herk_x_ker_var2.c rename to frame/3/gemmt/bli_gemmt_x_ker_var2.c index b6769d719..6d24ea496 100644 --- a/frame/3/herk/bli_herk_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -37,10 +37,10 @@ static gemm_var_oft vars[2] = { - bli_herk_l_ker_var2, bli_herk_u_ker_var2, + bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2, }; -void bli_herk_x_ker_var2 +void bli_gemmt_x_ker_var2 ( obj_t* a, obj_t* ah, diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c similarity index 97% rename from frame/3/herk/other/bli_herk_l_ker_var2.c rename to frame/3/gemmt/other/bli_gemmt_l_ker_var2.c index 22439f5b2..0bf4b1a0f 100644 --- a/frame/3/herk/other/bli_herk_l_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); -void bli_herk_l_ker_var2 +void bli_gemmt_l_ker_var2 ( obj_t* a, obj_t* b, @@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ @@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c similarity index 97% rename from frame/3/herk/other/bli_herk_u_ker_var2.c rename to frame/3/gemmt/other/bli_gemmt_u_ker_var2.c index 1aa3ce12d..1655bea55 100644 --- a/frame/3/herk/other/bli_herk_u_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -#define FUNCPTR_T herk_fp +#define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( @@ -57,10 +57,10 @@ typedef void (*FUNCPTR_T) thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); -void bli_herk_u_ker_var2 +void bli_gemmt_u_ker_var2 ( obj_t* a, obj_t* b, @@ -318,11 +318,11 @@ void PASTEMAC(ch,varname) \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \ + a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \ + b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ @@ -405,5 +405,5 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h deleted file mode 100644 index 02975c2b5..000000000 --- a/frame/3/her2k/bli_her2k.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_her2k_front.h" - diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c deleted file mode 100644 index 459ab05c7..000000000 --- a/frame/3/her2k/bli_her2k_front.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_her2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t alpha_conj; - obj_t c_local; - obj_t a_local; - obj_t bh_local; - obj_t b_local; - obj_t ah_local; - - // If alpha is zero, scale by beta, zero the imaginary components of - // the diagonal elements, and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - bli_setid( &BLIS_ZERO, c ); - return; - } - - // Alias A, B, and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For her2k, the first and second right-hand "B" operands are simply B' - // and A'. - bli_obj_alias_to( b, &bh_local ); - bli_obj_induce_trans( &bh_local ); - bli_obj_toggle_conj( &bh_local ); - bli_obj_alias_to( a, &ah_local ); - bli_obj_induce_trans( &ah_local ); - bli_obj_toggle_conj( &ah_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_swap( &a_local, &bh_local ); - bli_obj_swap( &b_local, &ah_local ); - - bli_obj_induce_trans( &a_local ); - bli_obj_induce_trans( &bh_local ); - bli_obj_induce_trans( &b_local ); - bli_obj_induce_trans( &ah_local ); - - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &bh_local, &c_local, cntx ); - bli_l3_set_schemas( &b_local, &ah_local, &c_local, cntx ); - - // Initialize a conjugated copy of alpha. - bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ), - BLIS_CONJUGATE, - alpha, - &alpha_conj ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_HER2K, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke herk twice, using beta only the first time. - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &bh_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - cntx, - rntm, - cntl - ); - - // The Hermitian rank-2k product was computed as A*B'+B*A', even for - // the diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-2k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. 
- bli_setid( &BLIS_ZERO, &c_local ); -} - diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h deleted file mode 100644 index 0efdb86c2..000000000 --- a/frame/3/her2k/bli_her2k_front.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_her2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h deleted file mode 100644 index c43728968..000000000 --- a/frame/3/herk/bli_herk.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "bli_herk_front.h" - -#include "bli_herk_var.h" - diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c deleted file mode 100644 index 324e18151..000000000 --- a/frame/3/herk/bli_herk_front.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_herk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t a_local; - obj_t ah_local; - obj_t c_local; - - // If alpha is zero, scale by beta, zero the imaginary components of - // the diagonal elements, and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - bli_setid( &BLIS_ZERO, c ); - return; - } - - // Alias A and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For herk, the right-hand "B" operand is simply A'. - bli_obj_alias_to( a, &ah_local ); - bli_obj_induce_trans( &ah_local ); - bli_obj_toggle_conj( &ah_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_toggle_conj( &a_local ); - bli_obj_toggle_conj( &ah_local ); - - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &ah_local, &c_local, cntx ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_HERK, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke the internal back-end. 
- bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &ah_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - // The Hermitian rank-k product was computed as A*A', even for the - // diagonal elements. Mathematically, the imaginary components of - // diagonal elements of a Hermitian rank-k product should always be - // zero. However, in practice, they sometimes accumulate meaningless - // non-zero values. To prevent this, we explicitly set those values - // to zero before returning. - bli_setid( &BLIS_ZERO, &c_local ); -} - diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h deleted file mode 100644 index 44778a450..000000000 --- a/frame/3/herk/bli_herk_front.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_herk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c deleted file mode 100644 index 8a99a2e24..000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); - - -void bli_herk_l_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = 
bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. 
*/ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Use interleaved (round robin) assignment of micropanels to threads in - the 2nd and 1st loops. 
*/ \ - bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_l_ker_var2rr.c b/frame/3/herk/other/bli_herk_l_ker_var2rr.c deleted file mode 100644 index c78a36b29..000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2rr.c +++ /dev/null @@ -1,555 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2rr); - -// -// -- Macrokernel functions for round-robin partitioning ----------------------- -// - -void bli_herk_l_ker_var2rr - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = 
bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. 
*/ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. Any remainder from this integer division is discarded, which - is what we want. That is, we want the rectangular region to contain - as many columns of whole microtiles as possible without including any - microtiles that intersect the diagonal. The number of iterations in - the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the initial rectangular region of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. 
*/ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and - 1st loops for the remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. 
*/ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2rr ) - diff --git a/frame/3/herk/other/bli_herk_l_ker_var2sl.c b/frame/3/herk/other/bli_herk_l_ker_var2sl.c deleted file mode 100644 index 17e0b0d0e..000000000 --- a/frame/3/herk/other/bli_herk_l_ker_var2sl.c +++ /dev/null @@ -1,556 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2sl); - -// -// -- Macrokernel functions for slab partitioning ------------------------------ -// - -void bli_herk_l_ker_var2sl - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. 
- buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. 
*/ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. Any remainder from this integer division is discarded, which - is what we want. 
That is, we want the rectangular region to contain - as many columns of whole microtiles as possible without including any - microtiles that intersect the diagonal. The number of iterations in - the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Use slab assignment of micropanels to threads in the 2nd and 1st - loops for the initial rectangular region of C (if it exists). */ \ - bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. 
*/ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd - loop and slab partitioning in the 1st loop for the remaining - triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_l_ker_var2sl ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c b/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c deleted file mode 100644 index 31d8fab62..000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); - - -void bli_herk_u_ker_var2 - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - 
obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. 
*/ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. 
*/ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Use interleaved (round robin) assignment of micropanels to threads in - the 2nd and 1st loops. */ \ - bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2rr.c b/frame/3/herk/other/bli_herk_u_ker_var2rr.c deleted file mode 100644 index 085ef6308..000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2rr.c +++ /dev/null @@ -1,557 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2rr); - -// -// -- Macrokernel functions for round-robin partitioning ----------------------- -// - -void bli_herk_u_ker_var2rr - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = 
bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. 
*/ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. - NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. 
*/ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in C. A non-zero remainder means we need to - add one additional iteration. That is, we want the triangular region - to contain as few columns of whole microtiles as possible while still - including all microtiles that intersect the diagonal. The number of - iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the initial triangular region of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and 1st - loops for the remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. 
*/ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2rr ) - diff --git a/frame/3/herk/other/bli_herk_u_ker_var2sl.c b/frame/3/herk/other/bli_herk_u_ker_var2sl.c deleted file mode 100644 index abc6e5188..000000000 --- a/frame/3/herk/other/bli_herk_u_ker_var2sl.c +++ /dev/null @@ -1,558 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2sl); - -// -// -- Macrokernel functions for slab partitioning ------------------------------ -// - -void bli_herk_u_ker_var2sl - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c 
= bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. 
For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. - NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). 
*/ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in C. A non-zero remainder means we need to - add one additional iteration. That is, we want the triangular region - to contain as few columns of whole microtiles as possible while still - including all microtiles that intersect the diagonal. The number of - iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd loop - and slab partitioning in the 1st loop for the initial triangular region - of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. 
*/ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Use slab assignment of micropanels to threads in the 2nd and 1st loops - loop for the remaining triangular region of C. */ \ - bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. 
*/ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( herk_u_ker_var2sl ) - diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h deleted file mode 100644 index 680e6e399..000000000 --- a/frame/3/syr2k/bli_syr2k.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "bli_syr2k_front.h" - diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c deleted file mode 100644 index 4f30cc3d5..000000000 --- a/frame/3/syr2k/bli_syr2k_front.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_syr2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t c_local; - obj_t a_local; - obj_t bt_local; - obj_t b_local; - obj_t at_local; - - // If alpha is zero, scale by beta and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } - - // Alias A, B, and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For syr2k, the first and second right-hand "B" operands are simply B' - // and A'. - bli_obj_alias_to( b, &bt_local ); - bli_obj_induce_trans( &bt_local ); - bli_obj_alias_to( a, &at_local ); - bli_obj_induce_trans( &at_local ); - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &bt_local, &c_local, cntx ); - bli_l3_set_schemas( &b_local, &at_local, &c_local, cntx ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_SYR2K, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke herk twice, using beta only the first time. - - // Invoke the internal back-end. 
- bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &bt_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); - - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - cntx, - rntm, - cntl - ); -} - diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h deleted file mode 100644 index 767bb6ee1..000000000 --- a/frame/3/syr2k/bli_syr2k_front.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_syr2k_front - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h deleted file mode 100644 index 4936fe431..000000000 --- a/frame/3/syrk/bli_syrk.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "bli_syrk_front.h" - diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c deleted file mode 100644 index 819941426..000000000 --- a/frame/3/syrk/bli_syrk_front.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_syrk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - bli_init_once(); - - obj_t a_local; - obj_t at_local; - obj_t c_local; - - // Alias A and C in case we need to apply transformations. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); - - // For syrk, the right-hand "B" operand is simply A^T. - bli_obj_alias_to( a, &at_local ); - bli_obj_induce_trans( &at_local ); - -#if 0 -#ifdef BLIS_ENABLE_SMALL_MATRIX - gint_t status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local, - cntx, cntl ); - if ( status == BLIS_SUCCESS ) return; -#endif -#endif - - // If alpha is zero, scale by beta and return. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) - { - bli_scalm( beta, c ); - return; - } - - // An optimization: If C is stored by rows and the micro-kernel prefers - // contiguous columns, or if C is stored by columns and the micro-kernel - // prefers contiguous rows, transpose the entire operation to allow the - // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) - { - bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. 
- bli_l3_set_schemas( &a_local, &at_local, &c_local, cntx ); - - // Parse and interpret the contents of the rntm_t object to properly - // set the ways of parallelism for each loop, and then make any - // additional modifications necessary for the current operation. - bli_rntm_set_ways_for_op - ( - BLIS_SYRK, - BLIS_LEFT, // ignored for her[2]k/syr[2]k - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ), - rntm - ); - - // Invoke the internal back-end. - bli_l3_thread_decorator - ( - bli_gemm_int, - BLIS_HERK, // operation family id - alpha, - &a_local, - &at_local, - beta, - &c_local, - cntx, - rntm, - cntl - ); -} - diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h deleted file mode 100644 index bf8d26a52..000000000 --- a/frame/3/syrk/bli_syrk_front.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_syrk_front - ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -#ifdef BLIS_ENABLE_SMALL_MATRIX -err_t bli_syrk_small - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl - ); -#endif - diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c index 78d139e6b..e76314036 100644 --- a/frame/base/bli_check.c +++ b/frame/base/bli_check.c @@ -819,22 +819,26 @@ err_t bli_check_if_exhausted_pool( pool_t* pool ) return e_val; } -err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx ) +err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ) { err_t e_val = BLIS_SUCCESS; + num_t dt; - dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - siz_t dt_size = bli_dt_size( dt ); - - // NOTE: For induced methods, we use the size of the complex datatypes - // (rather than the size of the native micro-kernels' datatype) because - // the macro-kernel needs this larger micro-tile footprint, even if the - // virtual micro-kernel implementation will only ever be writing to half - // of it (real or imaginary part) at a time. 
- - if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE ) - e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE; + for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) + { + dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); + siz_t dt_size = bli_dt_size( dt ); + + // NOTE: For induced methods, we use the size of the complex datatypes + // (rather than the size of the native micro-kernels' datatype) because + // the macro-kernel needs this larger micro-tile footprint, even if the + // virtual micro-kernel implementation will only ever be writing to half + // of it (real or imaginary part) at a time. + + if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE ) + e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE; + } return e_val; } diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h index 70ec2fd8f..276d27689 100644 --- a/frame/base/bli_check.h +++ b/frame/base/bli_check.h @@ -103,7 +103,7 @@ err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); -err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx ); +err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index c250191fc..0a5bcafd4 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -449,6 +449,11 @@ void bli_gks_register_cntx e_val = bli_check_valid_mc_mod_mult( mc, nr ); bli_check_error_code( e_val ); e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val ); #endif + + // Verify that the register blocksizes in the context are sufficiently large + // relative to the maximum stack buffer size defined at configure-time. 
+ e_val = bli_check_sufficient_stack_buf_size( gks_id_nat ); + bli_check_error_code( e_val ); } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index fa7901583..8a3dcd30a 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -180,12 +180,13 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) // -- BLIS implementation query (level-3) -------------------------------------- char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } +char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } -char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HERK, dt ); } -char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HER2K, dt ); } +char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } -char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYRK, dt ); } -char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYR2K, dt ); } +char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } char* 
bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index d900ca4f5..99c7d000d 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -91,6 +91,7 @@ BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index da7643eb6..95587e4a7 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -266,7 +266,7 @@ void bli_acquire_mpart_mdim // diagonal, then set the subpartition structure to "general"; otherwise // we let the subpartition inherit the storage structure of its immediate // parent. - if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && bli_obj_is_outside_diag( sub_obj ) ) { // NOTE: This comment may be out-of-date since we now distinguish @@ -274,10 +274,10 @@ void bli_acquire_mpart_mdim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. 
(Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root @@ -489,7 +489,7 @@ void bli_acquire_mpart_ndim // diagonal), and the subpartition does not intersect the root matrix's // diagonal, then we might need to modify some of the subpartition's // properties, depending on its structure type. - if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && bli_obj_is_outside_diag( sub_obj ) ) { // NOTE: This comment may be out-of-date since we now distinguish @@ -497,10 +497,10 @@ void bli_acquire_mpart_ndim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. (Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root @@ -742,7 +742,7 @@ void bli_acquire_mpart_mndim // diagonal, then set the subpartition structure to "general"; otherwise // we let the subpartition inherit the storage structure of its immediate // parent. 
- if ( !bli_obj_root_is_general( sub_obj ) && + if ( !bli_obj_root_is_general( sub_obj ) && req_part != BLIS_SUBPART00 && req_part != BLIS_SUBPART11 && req_part != BLIS_SUBPART22 ) @@ -762,10 +762,10 @@ void bli_acquire_mpart_mndim // Note that we cannot mark the subpartition object as general/dense // here since it makes sense to preserve the existing uplo information // a while longer so that the correct kernels are invoked. (Example: - // incremental packing/computing in herk produces subpartitions that + // incremental packing/computing in gemmt produces subpartitions that // appear general/dense, but their uplo fields are needed to be either // lower or upper, to determine which macro-kernel gets called in the - // herk_int() back-end.) + // gemmt_int() back-end.) // If the subpartition lies entirely in an "unstored" triangle of the // root matrix, then we need to tweak the subpartition. If the root diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 9ebd47de1..6dc4f9141 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -678,7 +678,7 @@ siz_t bli_thread_range_mdim // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } - else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } @@ -737,7 +737,7 @@ siz_t bli_thread_range_ndim // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). 
if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } - else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } diff --git a/kernels/zen/3/bli_syrk_small.c b/kernels/zen/3/bli_gemmt_small.c similarity index 99% rename from kernels/zen/3/bli_syrk_small.c rename to kernels/zen/3/bli_gemmt_small.c index 23d47298c..f2fd88de7 100644 --- a/kernels/zen/3/bli_syrk_small.c +++ b/kernels/zen/3/bli_gemmt_small.c @@ -52,9 +52,9 @@ static float C_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES) static double D_A_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); static double D_C_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); -#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. -#define AT_MR 4 // The kernel dimension of the A transpose SYRK kernel.(AT_MR * NR). -static err_t bli_ssyrk_small +#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. +#define AT_MR 4 // The kernel dimension of the A transpose GEMMT kernel.(AT_MR * NR). 
+static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, @@ -65,7 +65,7 @@ static err_t bli_ssyrk_small cntl_t* cntl ); -static err_t bli_dsyrk_small +static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, @@ -76,7 +76,7 @@ static err_t bli_dsyrk_small cntl_t* cntl ); -static err_t bli_ssyrk_small_atbn +static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -87,7 +87,7 @@ static err_t bli_ssyrk_small_atbn cntl_t* cntl ); -static err_t bli_dsyrk_small_atbn +static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -98,11 +98,11 @@ static err_t bli_dsyrk_small_atbn cntl_t* cntl ); /* -* The bli_syrk_small function will use the +* The bli_gemmt_small function will use the * custom MRxNR kernels, to perform the computation. * The custom kernels are used if the [M * N] < 240 * 240 */ -err_t bli_syrk_small +err_t bli_gemmt_small ( obj_t* alpha, obj_t* a, @@ -113,20 +113,20 @@ err_t bli_syrk_small cntl_t* cntl ) { - // FGVZ: This code was originally in bli_syrk_front(). However, it really - // fits more naturally here within the bli_syrk_small() function. This + // FGVZ: This code was originally in bli_gemmt_front(). However, it really + // fits more naturally here within the bli_gemmt_small() function. This // becomes a bit more obvious now that the code is here, as it contains - // cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_SYRK, which are specific + // cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_GEMMT, which are specific // to this implementation. if ( bli_obj_has_trans( a ) ) { // Continue with small implementation. 
; } - else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_SYRK && - bli_obj_width( a ) < BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) || - ( bli_obj_length( a ) < BLIS_SMALL_MATRIX_A_THRES_M_SYRK && - bli_obj_width( a ) <= BLIS_SMALL_MATRIX_A_THRES_N_SYRK ) ) + else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && + bli_obj_width( a ) < BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) || + ( bli_obj_length( a ) < BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && + bli_obj_width( a ) <= BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) ) { // Continue with small implementation. ; @@ -162,11 +162,11 @@ err_t bli_syrk_small { if (dt == BLIS_FLOAT) { - return bli_ssyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + return bli_sgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } else if (dt == BLIS_DOUBLE) { - return bli_dsyrk_small_atbn(alpha, a, b, beta, c, cntx, cntl); + return bli_dgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } } @@ -175,19 +175,19 @@ err_t bli_syrk_small if (dt == BLIS_DOUBLE) { - return bli_dsyrk_small(alpha, a, b, beta, c, cntx, cntl); + return bli_dgemmt_small(alpha, a, b, beta, c, cntx, cntl); } if (dt == BLIS_FLOAT) { - return bli_ssyrk_small(alpha, a, b, beta, c, cntx, cntl); + return bli_sgemmt_small(alpha, a, b, beta, c, cntx, cntl); } return BLIS_NOT_YET_IMPLEMENTED; }; -static err_t bli_ssyrk_small +static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, @@ -240,7 +240,7 @@ static err_t bli_ssyrk_small beta_cast = (beta->buffer); int required_packing_A = 1; - // when N is equal to 1 call GEMV instead of SYRK + // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv @@ -1584,7 +1584,7 @@ static err_t bli_ssyrk_small } } } - + //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy in case of beta = 0 @@ -1673,7 +1673,7 @@ static err_t bli_ssyrk_small _i = 0; for ( _l = 0; _l < k; _l++ ) { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); ymm0 = 
_mm256_loadu_ps((C + _i*rsc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0); @@ -1703,11 +1703,11 @@ static err_t bli_ssyrk_small _l = 0; while ( _l < k ) { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); - + _i += 8; _l++; } @@ -1729,8 +1729,8 @@ static err_t bli_ssyrk_small _i = 0; _l = 0; while ( _l < k ) - { - ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); + { + ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -1747,7 +1747,7 @@ static err_t bli_ssyrk_small } } } - + return BLIS_SUCCESS; } else @@ -1756,7 +1756,7 @@ static err_t bli_ssyrk_small }; -static err_t bli_dsyrk_small +static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, @@ -1810,7 +1810,7 @@ static err_t bli_dsyrk_small beta_cast = (beta->buffer); int required_packing_A = 1; - // when N is equal to 1 call GEMV instead of SYRK + // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv @@ -3154,7 +3154,7 @@ static err_t bli_dsyrk_small } } } - + //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy for beta = 0 @@ -3195,7 +3195,7 @@ static err_t bli_dsyrk_small { ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); - + _i += 4; _l++; } @@ -3243,7 +3243,7 @@ static err_t bli_dsyrk_small _i = 0; for ( _l = 0; _l < k; _l++ ) { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC), 
ymm0); @@ -3273,7 +3273,7 @@ static err_t bli_dsyrk_small _l = 0; while ( _l < k ) { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -3299,8 +3299,8 @@ static err_t bli_dsyrk_small _i = 0; _l = 0; while ( _l < k ) - { - ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); + { + ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); @@ -3317,7 +3317,7 @@ static err_t bli_dsyrk_small } } } - + return BLIS_SUCCESS; } else @@ -3326,7 +3326,7 @@ static err_t bli_dsyrk_small }; -static err_t bli_ssyrk_small_atbn +static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -3364,7 +3364,7 @@ static err_t bli_ssyrk_small_atbn alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); - // The non-copy version of the A^T SYRK gives better performance for the small M cases. + // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { @@ -3715,7 +3715,7 @@ static err_t bli_ssyrk_small_atbn } } } - + //copy/compute sryk values back to C if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C { @@ -3774,7 +3774,7 @@ static err_t bli_ssyrk_small_atbn return BLIS_NONCONFORMAL_DIMENSIONS; } -static err_t bli_dsyrk_small_atbn +static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, @@ -3812,7 +3812,7 @@ static err_t bli_dsyrk_small_atbn alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); - // The non-copy version of the A^T SYRK gives better performance for the small M cases. 
+ // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { @@ -3968,7 +3968,7 @@ static err_t bli_dsyrk_small_atbn result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; - + tC += ldc; ymm6 = _mm256_hadd_pd(ymm6, ymm6); _mm256_storeu_pd(scratch, ymm6); @@ -4199,7 +4199,7 @@ static err_t bli_dsyrk_small_atbn } } } - + return BLIS_SUCCESS; } else diff --git a/sandbox/gemmlike/bls_gemm_check.c b/sandbox/gemmlike/bls_gemm_check.c index bd6c2647e..369017338 100644 --- a/sandbox/gemmlike/bls_gemm_check.c +++ b/sandbox/gemmlike/bls_gemm_check.c @@ -99,11 +99,6 @@ void bls_gemm_check e_val = bli_check_object_buffer( c ); bli_check_error_code( e_val ); - // Check for sufficiently sized stack buffers - - e_val = bli_check_sufficient_stack_buf_size( bli_obj_dt( a ), cntx ); - bli_check_error_code( e_val ); - // Check object dimensions. e_val = bli_check_level3_dims( a, b, c ); From 7bc8ab485e89cfc6032932e57929e208a28f4be5 Mon Sep 17 00:00:00 2001 From: Meghana-vankadari <74656386+Meghana-vankadari@users.noreply.github.com> Date: Fri, 12 Nov 2021 04:16:14 +0530 Subject: [PATCH 002/230] Added BLAS/CBLAS APIs for axpby, gemm_batch. (#566) Details: - Expanded the BLAS compatibility layer to include support for ?axpby_() and ?gemm_batch_(). The former is a straightforward BLAS-like interface into the axpbyv operation while the latter implements a batched gemm via loops over bli_?gemm(). Also expanded the CBLAS compatibility layer to include support for cblas_?axpby() and cblas_?gemm_batch(), which serve as wrappers to the corresponding (new) BLAS-like APIs. Thanks to Meghana Vankadari for submitting these new APIs via #566. - Fixed a long-standing bug in common.mk that for some reason never manifested until now. Previously, CBLAS source files were compiled *without* the location of cblas.h being specified via a -I flag. 
I'm not sure why this worked, but it may be due to the fact that the cblas.h file resided in the same directory as all of the CBLAS source, and perhaps compilers implicitly add a -I flag for the directory that corresponds to the location of the source file being compiled. This bug only showed up because some CBLAS-like source code was moved into an 'extra' subdirectory of that frame/compat/cblas/src directory. After moving the code, compilation for those files failed (because the cblas.h header file, presumably, could not be found in the same location). This bug was fixed within common.mk by explicitly adding the cblas.h directory to the list of -I flags passed to the compiler. - Added test_axpbyv.c and test_gemm_batch.c files to 'test' directory, and updated test/Makefile to build those drivers. - Fixed typo in error message string in cblas_sgemm.c. --- common.mk | 14 +- frame/compat/bli_blas.h | 6 + frame/compat/cblas/src/cblas.h | 57 ++ frame/compat/cblas/src/cblas_f77.h | 21 +- frame/compat/cblas/src/cblas_sgemm.c | 30 +- frame/compat/cblas/src/extra/cblas_caxpby.c | 27 + .../cblas/src/extra/cblas_cgemm_batch.c | 168 +++++ frame/compat/cblas/src/extra/cblas_daxpby.c | 26 + .../cblas/src/extra/cblas_dgemm_batch.c | 168 +++++ frame/compat/cblas/src/extra/cblas_saxpby.c | 28 + .../cblas/src/extra/cblas_sgemm_batch.c | 168 +++++ frame/compat/cblas/src/extra/cblas_zaxpby.c | 27 + .../cblas/src/extra/cblas_zgemm_batch.c | 168 +++++ frame/compat/extra/bla_axpby.c | 89 +++ frame/compat/extra/bla_axpby.h | 54 ++ frame/compat/extra/bla_gemm_batch.c | 254 ++++++++ frame/compat/extra/bla_gemm_batch.h | 61 ++ test/Makefile | 8 +- test/test_axpbyv.c | 293 +++++++++ test/test_gemm_batch.c | 584 ++++++++++++++++++ 20 files changed, 2226 insertions(+), 25 deletions(-) create mode 100644 frame/compat/cblas/src/extra/cblas_caxpby.c create mode 100644 frame/compat/cblas/src/extra/cblas_cgemm_batch.c create mode 100644 frame/compat/cblas/src/extra/cblas_daxpby.c create mode 100644 
frame/compat/cblas/src/extra/cblas_dgemm_batch.c create mode 100644 frame/compat/cblas/src/extra/cblas_saxpby.c create mode 100644 frame/compat/cblas/src/extra/cblas_sgemm_batch.c create mode 100644 frame/compat/cblas/src/extra/cblas_zaxpby.c create mode 100644 frame/compat/cblas/src/extra/cblas_zgemm_batch.c create mode 100644 frame/compat/extra/bla_axpby.c create mode 100644 frame/compat/extra/bla_axpby.h create mode 100644 frame/compat/extra/bla_gemm_batch.c create mode 100644 frame/compat/extra/bla_gemm_batch.h create mode 100644 test/test_axpbyv.c create mode 100644 test/test_gemm_batch.c diff --git a/common.mk b/common.mk index 2da306d79..90c3da83f 100644 --- a/common.mk +++ b/common.mk @@ -1009,9 +1009,11 @@ BLIS_H_FLAT := $(BASE_INC_PATH)/$(BLIS_H) # # Isolate the path to cblas.h by filtering the file from the list of framework -# header files. +# header files, and then strip the filename to obtain the directory in which +# cblas.h resides. CBLAS_H := cblas.h CBLAS_H_SRC_PATH := $(filter %/$(CBLAS_H), $(FRAME_H99_FILES)) +CBLAS_H_DIRPATH := $(dir $(CBLAS_H_SRC_PATH)) # Construct the path to what will be the intermediate flattened/monolithic # cblas.h file. @@ -1037,7 +1039,8 @@ REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \ $(FRAME_H99_FILES))))) # Add -I to each header path so we can specify our include search paths to the -# C compiler. Then add frame/include since it's needed for bli_oapi_w[o]_cntx.h. +# C compiler. Then add frame/include since it's needed when compiling source +# files that #include bli_oapi_ba.h or bli_oapi_ex.h. REF_KER_I_PATHS := $(strip $(patsubst %, -I%, $(REF_KER_H_PATHS))) REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include @@ -1046,6 +1049,13 @@ REF_KER_I_PATHS += -I$(DIST_PATH)/frame/include # now #include the monolithic/flattened blis.h instead. 
CINCFLAGS := -I$(BASE_INC_PATH) $(REF_KER_I_PATHS) +# If CBLAS is enabled, we also include the path to the cblas.h directory so +# that the compiler will be able to find cblas.h as the CBLAS source code is +# being compiled. +ifeq ($(MK_ENABLE_CBLAS),yes) +CINCFLAGS += -I$(CBLAS_H_DIRPATH) +endif + # Obtain a list of header paths in the configured sandbox. Then add -I to each # header path. CSBOXINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS))) diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h index 1ce976453..a65953c11 100644 --- a/frame/compat/bli_blas.h +++ b/frame/compat/bli_blas.h @@ -113,6 +113,7 @@ #include "bla_amax.h" #include "bla_asum.h" #include "bla_axpy.h" +#include "bla_axpby.h" #include "bla_copy.h" #include "bla_dot.h" #include "bla_nrm2.h" @@ -199,6 +200,11 @@ #include "bla_trsm_check.h" #include "bla_gemmt_check.h" +// -- Batch prototypes -- + +#include "bla_gemm_batch.h" + + // -- Fortran-compatible APIs to BLIS functions -- #include "b77_thread.h" diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h index 85e24674d..cee74233c 100644 --- a/frame/compat/cblas/src/cblas.h +++ b/frame/compat/cblas/src/cblas.h @@ -1,3 +1,4 @@ + #ifndef CBLAS_H #define CBLAS_H #include @@ -595,6 +596,62 @@ void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); + +/* + * =========================================================================== + * BLAS Extension prototypes + * =========================================================================== + */ + +// -- APIs to operations unique to BLIS -- + +void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, + f77_int incX, float beta, float *Y, f77_int incY); +void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, + f77_int incX, double beta, double *Y, f77_int incY); +void BLIS_EXPORT_BLAS cblas_caxpby(f77_int 
N, const void *alpha, + const void *X, f77_int incX, const void* beta, + void *Y, f77_int incY); +void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, + const void *X, f77_int incX, const void *beta, + void *Y, f77_int incY); + +// -- Batch APIs -- + +void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, + enum CBLAS_TRANSPOSE *TransA_array, + enum CBLAS_TRANSPOSE *TransB_array, + f77_int *M_array, f77_int *N_array, + f77_int *K_array, const float *alpha_array, const float **A, + f77_int *lda_array, const float **B, f77_int *ldb_array, + const float *beta_array, float **C, f77_int *ldc_array, + f77_int group_count, f77_int *group_size); +void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, + enum CBLAS_TRANSPOSE *TransA_array, + enum CBLAS_TRANSPOSE *TransB_array, + f77_int *M_array, f77_int *N_array, + f77_int *K_array, const double *alpha_array, + const double **A,f77_int *lda_array, + const double **B, f77_int *ldb_array, + const double *beta_array, double **C, f77_int *ldc_array, + f77_int group_count, f77_int *group_size); +void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, + enum CBLAS_TRANSPOSE *TransA_array, + enum CBLAS_TRANSPOSE *TransB_array, + f77_int *M_array, f77_int *N_array, + f77_int *K_array, const void *alpha_array, const void **A, + f77_int *lda_array, const void **B, f77_int *ldb_array, + const void *beta_array, void **C, f77_int *ldc_array, + f77_int group_count, f77_int *group_size); +void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, + enum CBLAS_TRANSPOSE *TransA_array, + enum CBLAS_TRANSPOSE *TransB_array, + f77_int *M_array, f77_int *N_array, + f77_int *K_array, const void *alpha_array, const void **A, + f77_int *lda_array, const void **B, f77_int *ldb_array, + const void *beta_array, void **C, f77_int *ldc_array, + f77_int group_count, f77_int *group_size); + #ifdef __cplusplus } #endif diff --git a/frame/compat/cblas/src/cblas_f77.h b/frame/compat/cblas/src/cblas_f77.h index 
5e94fdf2c..e534d2054 100644 --- a/frame/compat/cblas/src/cblas_f77.h +++ b/frame/compat/cblas/src/cblas_f77.h @@ -14,7 +14,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -200,9 +200,20 @@ /* * BLAS extensions */ -#define F77_sgemmt sgemmt_ -#define F77_dgemmt dgemmt_ -#define F77_cgemmt cgemmt_ -#define F77_zgemmt zgemmt_ +#define F77_sgemmt sgemmt_ +#define F77_dgemmt dgemmt_ +#define F77_cgemmt cgemmt_ +#define F77_zgemmt zgemmt_ + +#define F77_saxpby saxpby_ +#define F77_daxpby daxpby_ +#define F77_caxpby caxpby_ +#define F77_zaxpby zaxpby_ + +#define F77_sgemm_batch sgemm_batch_ +#define F77_dgemm_batch dgemm_batch_ +#define F77_cgemm_batch cgemm_batch_ +#define F77_zgemm_batch zgemm_batch_ + #endif /* CBLAS_F77_H */ diff --git a/frame/compat/cblas/src/cblas_sgemm.c b/frame/compat/cblas/src/cblas_sgemm.c index 89d0f07a8..bf40b9c0d 100644 --- a/frame/compat/cblas/src/cblas_sgemm.c +++ b/frame/compat/cblas/src/cblas_sgemm.c @@ -7,6 +7,8 @@ * Written by Keita Teranishi * 4/8/1998 * + * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
+ * */ #include "cblas.h" @@ -17,12 +19,12 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc) { - char TA, TB; + char TA, TB; #ifdef F77_CHAR F77_CHAR F77_TA, F77_TB; #else - #define F77_TA &TA - #define F77_TB &TB + #define F77_TA &TA + #define F77_TB &TB #endif #ifdef F77_INT @@ -36,7 +38,7 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, #define F77_ldb ldb #define F77_ldc ldc #endif - + extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; @@ -46,9 +48,9 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; - else + else { - cblas_xerbla(2, "cblas_sgemm", + cblas_xerbla(2, "cblas_sgemm", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; @@ -58,9 +60,9 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; - else + else { - cblas_xerbla(3, "cblas_sgemm", + cblas_xerbla(3, "cblas_sgemm", "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; @@ -79,9 +81,9 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; - else + else { - cblas_xerbla(2, "cblas_sgemm", + cblas_xerbla(2, "cblas_sgemm", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; @@ -90,10 +92,10 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; - else + else { - cblas_xerbla(2, "cblas_sgemm", - "Illegal TransA setting, %d\n", 
TransA); + cblas_xerbla(2, "cblas_sgemm", + "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; @@ -104,7 +106,7 @@ void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, #endif F77_sgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, &alpha, B, &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc); - } else + } else cblas_xerbla(1, "cblas_sgemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0;
diff --git a/frame/compat/cblas/src/extra/cblas_caxpby.c b/frame/compat/cblas/src/extra/cblas_caxpby.c
new file mode 100644
index 000000000..e8400d91b
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_caxpby.c
@@ -0,0 +1,27 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_caxpby.c
+ *
+ * C interface to caxpby: N, incX and incY arrive by value and are passed by
+ * address to F77_caxpby; alpha, beta, X and Y are reinterpreted as scomplex.
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ *
+ */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_caxpby( f77_int N, const void *alpha,
+                   const void *X, f77_int incX,
+                   const void *beta,
+                   void *Y, f77_int incY)
+{
+#ifdef F77_INT
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; /* copy into the F77_INT type */
+#else
+   #define F77_N N /* NOTE: these aliases are never #undef'd in this file */
+   #define F77_incX incX
+   #define F77_incY incY
+#endif
+   F77_caxpby( &F77_N, (const scomplex*)alpha, (const scomplex*)X, &F77_incX, (const scomplex*)beta, (scomplex*)Y, &F77_incY); /* read-only operands keep their const */
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_cgemm_batch.c b/frame/compat/cblas/src/extra/cblas_cgemm_batch.c
new file mode 100644
index 000000000..18dd0bad5
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_cgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_cgemm_batch.c
+ * This program is a C interface to cgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+/* cblas_cgemm_batch: for each of the group_count groups, maps the CBLAS transpose enums to 'N'/'T'/'C' and forwards all arrays to F77_cgemm_batch; illegal flags or Order are reported through cblas_xerbla. */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_cgemm_batch(enum CBLAS_ORDER Order,
+                       enum CBLAS_TRANSPOSE *TransA_array,
+                       enum CBLAS_TRANSPOSE *TransB_array,
+                       f77_int *M_array, f77_int *N_array,
+                       f77_int *K_array, const void *alpha_array,
+                       const void **A_array, f77_int *lda_array,
+                       const void **B_array, f77_int *ldb_array,
+                       const void *beta_array,
+                       void **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{
+   char TA[group_count], TB[group_count]; /* NOTE(review): VLAs - assumes group_count > 0; confirm with callers */
+#ifdef F77_CHAR
+   F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+   #define F77_TA TA
+   #define F77_TB TB
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_GRP_COUNT = group_count;
+   F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+   F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+   F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+   #define F77_GRP_COUNT group_count
+   #define F77_M M_array
+   #define F77_N N_array
+   #define F77_K K_array
+   #define F77_lda lda_array
+   #define F77_ldb ldb_array
+   #define F77_ldc ldc_array
+   #define F77_GRP_SIZE group_size
+#endif
+
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1;
+
+   dim_t i;
+   if( Order == CblasColMajor )
+   {
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_cgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+         if(TransB_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_cgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i);
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_cgemm_batch(F77_TA, F77_TB,
+                      F77_M, F77_N, F77_K,
+                      (const scomplex*)alpha_array,
+                      (const scomplex**)A_array, F77_lda,
+                      (const scomplex**)B_array, F77_ldb,
+                      (const scomplex*)beta_array,
+                      (scomplex**)C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   }
+   else if (Order == CblasRowMajor)
+   {
+      RowMajorStrg = 1;
+      /* Row-major: A/B, M/N and the transpose flags are swapped in the call below; i is the function-scope index. */
+
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_cgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+         if(TransB_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_cgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i); /* per-element, as in the column-major branch */
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_cgemm_batch(F77_TA, F77_TB,
+                      F77_N, F77_M, F77_K,
+                      (const scomplex*)alpha_array,
+                      (const scomplex**)B_array, F77_ldb,
+                      (const scomplex**)A_array, F77_lda,
+                      (const scomplex*)beta_array,
+                      (scomplex**)C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+      cblas_xerbla(1, "cblas_cgemm_batch",
+                   "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_daxpby.c b/frame/compat/cblas/src/extra/cblas_daxpby.c
new file mode 100644
index 000000000..8fbea4d5a
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_daxpby.c
@@ -0,0 +1,26 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_daxpby.c
+ *
+ * The program is a C interface to daxpby.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_daxpby( f77_int N, double alpha,
+                   const double *X, f77_int incX,
+                   double beta,
+                   double *Y, f77_int incY)
+{
+#ifdef F77_INT
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY;
+#else
+   #define F77_N N
+   #define F77_incX incX
+   #define F77_incY incY
+#endif
+   F77_daxpby( &F77_N, &alpha, X, &F77_incX, &beta, Y, &F77_incY);
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_dgemm_batch.c b/frame/compat/cblas/src/extra/cblas_dgemm_batch.c
new file mode 100644
index 000000000..a2bed3b1a
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_dgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_dgemm_batch.c
+ * This program is a C interface to dgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+/* cblas_dgemm_batch: for each of the group_count groups, maps the CBLAS transpose enums to 'N'/'T'/'C' and forwards all arrays to F77_dgemm_batch; illegal flags or Order are reported through cblas_xerbla. */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_dgemm_batch(enum CBLAS_ORDER Order,
+                       enum CBLAS_TRANSPOSE *TransA_array,
+                       enum CBLAS_TRANSPOSE *TransB_array,
+                       f77_int *M_array, f77_int *N_array,
+                       f77_int *K_array, const double *alpha_array,
+                       const double **A_array, f77_int *lda_array,
+                       const double **B_array, f77_int *ldb_array,
+                       const double *beta_array,
+                       double **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{
+   char TA[group_count], TB[group_count]; /* NOTE(review): VLAs - assumes group_count > 0; confirm with callers */
+#ifdef F77_CHAR
+   F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+   #define F77_TA TA
+   #define F77_TB TB
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_GRP_COUNT = group_count;
+   F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+   F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+   F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+   #define F77_GRP_COUNT group_count
+   #define F77_M M_array
+   #define F77_N N_array
+   #define F77_K K_array
+   #define F77_lda lda_array
+   #define F77_ldb ldb_array
+   #define F77_ldc ldc_array
+   #define F77_GRP_SIZE group_size
+#endif
+
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1;
+
+   dim_t i;
+   if( Order == CblasColMajor )
+   {
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_dgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+         if(TransB_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_dgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i);
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_dgemm_batch(F77_TA, F77_TB,
+                      F77_M, F77_N, F77_K,
+                      alpha_array,
+                      A_array, F77_lda,
+                      B_array, F77_ldb,
+                      beta_array,
+                      C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   }
+   else if (Order == CblasRowMajor)
+   {
+      RowMajorStrg = 1;
+      /* Row-major: A/B, M/N and the transpose flags are swapped in the call below; i is the function-scope index. */
+
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_dgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+         if(TransB_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_dgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i); /* per-element, as in the column-major branch */
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_dgemm_batch(F77_TA, F77_TB,
+                      F77_N, F77_M, F77_K,
+                      alpha_array,
+                      B_array, F77_ldb,
+                      A_array, F77_lda,
+                      beta_array,
+                      C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+      cblas_xerbla(1, "cblas_dgemm_batch",
+                   "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_saxpby.c
b/frame/compat/cblas/src/extra/cblas_saxpby.c
new file mode 100644
index 000000000..685282123
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_saxpby.c
@@ -0,0 +1,28 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_saxpby.c
+ *
+ * C interface to saxpby.
+ * N, incX, incY, alpha and beta arrive by value and are passed by address to F77_saxpby.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ */
+
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_saxpby( f77_int N, float alpha,
+                   const float *X, f77_int incX,
+                   float beta,
+                   float *Y, f77_int incY)
+{
+#ifdef F77_INT
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; /* copy the integer arguments into F77_INT temporaries */
+#else
+   #define F77_N N /* otherwise the F77_* names alias the arguments themselves */
+   #define F77_incX incX
+   #define F77_incY incY /* NOTE: these aliases are never #undef'd in this file */
+#endif
+   F77_saxpby( &F77_N, &alpha, X, &F77_incX, &beta, Y, &F77_incY); /* X and Y are forwarded unchanged */
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_sgemm_batch.c b/frame/compat/cblas/src/extra/cblas_sgemm_batch.c
new file mode 100644
index 000000000..3e8517db2
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_sgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_sgemm_batch.c
+ * This program is a C interface to sgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+/* cblas_sgemm_batch: for each of the group_count groups, maps the CBLAS transpose enums to 'N'/'T'/'C' and forwards all arrays to F77_sgemm_batch; illegal flags or Order are reported through cblas_xerbla. */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_sgemm_batch(enum CBLAS_ORDER Order,
+                       enum CBLAS_TRANSPOSE *TransA_array,
+                       enum CBLAS_TRANSPOSE *TransB_array,
+                       f77_int *M_array, f77_int *N_array,
+                       f77_int *K_array, const float *alpha_array,
+                       const float **A_array, f77_int *lda_array,
+                       const float **B_array, f77_int *ldb_array,
+                       const float *beta_array,
+                       float **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{
+   char TA[group_count], TB[group_count]; /* NOTE(review): VLAs - assumes group_count > 0; confirm with callers */
+#ifdef F77_CHAR
+   F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+   #define F77_TA TA
+   #define F77_TB TB
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_GRP_COUNT = group_count;
+   F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+   F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+   F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+   #define F77_GRP_COUNT group_count
+   #define F77_M M_array
+   #define F77_N N_array
+   #define F77_K K_array
+   #define F77_lda lda_array
+   #define F77_ldb ldb_array
+   #define F77_ldc ldc_array
+   #define F77_GRP_SIZE group_size
+#endif
+
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1;
+
+   dim_t i;
+   if( Order == CblasColMajor )
+   {
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_sgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+         if(TransB_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_sgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i);
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_sgemm_batch(F77_TA, F77_TB,
+                      F77_M, F77_N, F77_K,
+                      alpha_array,
+                      A_array, F77_lda,
+                      B_array, F77_ldb,
+                      beta_array,
+                      C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   }
+   else if (Order == CblasRowMajor)
+   {
+      RowMajorStrg = 1;
+      /* Row-major: A/B, M/N and the transpose flags are swapped in the call below; i is the function-scope index. */
+
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_sgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+         if(TransB_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_sgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i); /* per-element, as in the column-major branch */
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_sgemm_batch(F77_TA, F77_TB,
+                      F77_N, F77_M, F77_K,
+                      alpha_array,
+                      B_array, F77_ldb,
+                      A_array, F77_lda,
+                      beta_array,
+                      C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+      cblas_xerbla(1, "cblas_sgemm_batch",
+                   "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_zaxpby.c
b/frame/compat/cblas/src/extra/cblas_zaxpby.c
new file mode 100644
index 000000000..483607ec9
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_zaxpby.c
@@ -0,0 +1,27 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ * cblas_zaxpby.c
+ *
+ * C interface to zaxpby: N, incX and incY arrive by value and are passed by
+ * address to F77_zaxpby; alpha, beta, X and Y are reinterpreted as dcomplex.
+ * Copyright (C) 2020, Advanced Micro Devices, Inc.
+ *
+ */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_zaxpby( f77_int N, const void *alpha,
+                   const void *X, f77_int incX,
+                   const void *beta,
+                   void *Y, f77_int incY)
+{
+#ifdef F77_INT
+   F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; /* copy into the F77_INT type */
+#else
+   #define F77_N N /* NOTE: these aliases are never #undef'd in this file */
+   #define F77_incX incX
+   #define F77_incY incY
+#endif
+   F77_zaxpby( &F77_N, (const dcomplex*)alpha, (const dcomplex*)X, &F77_incX, (const dcomplex*)beta, (dcomplex*)Y, &F77_incY); /* read-only operands keep their const */
+}
+#endif
diff --git a/frame/compat/cblas/src/extra/cblas_zgemm_batch.c b/frame/compat/cblas/src/extra/cblas_zgemm_batch.c
new file mode 100644
index 000000000..2d188a9f0
--- /dev/null
+++ b/frame/compat/cblas/src/extra/cblas_zgemm_batch.c
@@ -0,0 +1,168 @@
+#include "blis.h"
+#ifdef BLIS_ENABLE_CBLAS
+/*
+ *
+ * cblas_zgemm_batch.c
+ * This program is a C interface to zgemm_batch.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ */
+/* cblas_zgemm_batch: for each of the group_count groups, maps the CBLAS transpose enums to 'N'/'T'/'C' and forwards all arrays to F77_zgemm_batch; illegal flags or Order are reported through cblas_xerbla. */
+#include "cblas.h"
+#include "cblas_f77.h"
+void cblas_zgemm_batch(enum CBLAS_ORDER Order,
+                       enum CBLAS_TRANSPOSE *TransA_array,
+                       enum CBLAS_TRANSPOSE *TransB_array,
+                       f77_int *M_array, f77_int *N_array,
+                       f77_int *K_array, const void *alpha_array,
+                       const void **A_array, f77_int *lda_array,
+                       const void **B_array, f77_int *ldb_array,
+                       const void *beta_array,
+                       void **C_array, f77_int *ldc_array,
+                       f77_int group_count, f77_int *group_size)
+{
+   char TA[group_count], TB[group_count]; /* NOTE(review): VLAs - assumes group_count > 0; confirm with callers */
+#ifdef F77_CHAR
+   F77_CHAR F77_TA[group_count], F77_TB[group_count];
+#else
+   #define F77_TA TA
+   #define F77_TB TB
+#endif
+
+#ifdef F77_INT
+   F77_INT F77_GRP_COUNT = group_count;
+   F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT];
+   F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT];
+   F77_INT F77_GRP_SIZE[F77_GRP_COUNT];
+#else
+   #define F77_GRP_COUNT group_count
+   #define F77_M M_array
+   #define F77_N N_array
+   #define F77_K K_array
+   #define F77_lda lda_array
+   #define F77_ldb ldb_array
+   #define F77_ldc ldc_array
+   #define F77_GRP_SIZE group_size
+#endif
+
+   extern int CBLAS_CallFromC;
+   extern int RowMajorStrg;
+   RowMajorStrg = 0;
+   CBLAS_CallFromC = 1;
+
+   dim_t i;
+   if( Order == CblasColMajor )
+   {
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_zgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+         if(TransB_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_zgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i);
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_zgemm_batch(F77_TA, F77_TB,
+                      F77_M, F77_N, F77_K,
+                      (const dcomplex*)alpha_array,
+                      (const dcomplex**)A_array, F77_lda,
+                      (const dcomplex**)B_array, F77_ldb,
+                      (const dcomplex*)beta_array,
+                      (dcomplex**)C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   }
+   else if (Order == CblasRowMajor)
+   {
+      RowMajorStrg = 1;
+      /* Row-major: A/B, M/N and the transpose flags are swapped in the call below; i is the function-scope index. */
+
+      for(i = 0; i < group_count; i++)
+      {
+         if(TransA_array[i] == CblasTrans) TB[i]='T';
+         else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C';
+         else if ( TransA_array[i] == CblasNoTrans )   TB[i]='N';
+         else
+         {
+            cblas_xerbla(2, "cblas_zgemm_batch",
+                         "Illegal TransA setting %d for group %d\n", TransA_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+         if(TransB_array[i] == CblasTrans) TA[i]='T';
+         else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C';
+         else if ( TransB_array[i] == CblasNoTrans )   TA[i]='N';
+         else
+         {
+            cblas_xerbla(3, "cblas_zgemm_batch",
+                         "Illegal TransB setting %d for group %d\n", TransB_array[i], (int)i);
+            CBLAS_CallFromC = 0;
+            RowMajorStrg = 0;
+            return;
+         }
+
+#ifdef F77_CHAR
+         F77_TA[i] = C2F_CHAR(TA+i); /* per-element, as in the column-major branch */
+         F77_TB[i] = C2F_CHAR(TB+i);
+#endif
+
+#ifdef F77_INT
+         F77_M[i] = M_array[i];
+         F77_N[i] = N_array[i];
+         F77_K[i] = K_array[i];
+         F77_lda[i] = lda_array[i];
+         F77_ldb[i] = ldb_array[i];
+         F77_ldc[i] = ldc_array[i];
+         F77_GRP_SIZE[i] = group_size[i];
+#endif
+      }
+
+      F77_zgemm_batch(F77_TA, F77_TB,
+                      F77_N, F77_M, F77_K,
+                      (const dcomplex*)alpha_array,
+                      (const dcomplex**)B_array, F77_ldb,
+                      (const dcomplex**)A_array, F77_lda,
+                      (const dcomplex*)beta_array,
+                      (dcomplex**)C_array, F77_ldc,
+                      &F77_GRP_COUNT, F77_GRP_SIZE);
+   } else
+      cblas_xerbla(1, "cblas_zgemm_batch",
+                   "Illegal Order setting, %d\n", Order);
+   CBLAS_CallFromC = 0;
+   RowMajorStrg = 0;
+}
+#endif
diff --git a/frame/compat/extra/bla_axpby.c b/frame/compat/extra/bla_axpby.c
new file mode 100644
index 000000000..d96d75d74
--- /dev/null
+++ b/frame/compat/extra/bla_axpby.c
@@ -0,0 +1,89 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+
+//
+// Define BLAS-to-BLIS interfaces.
+//
+#undef  GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
+/* Defines PASTEF77(ch,blasname): unpacks the by-reference BLAS arguments and forwards them to PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF). */ \
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   beta, \
+             ftype*   y, const f77_int* incy \
+     ) \
+{ \
+	dim_t  n0; \
+	ftype* x0; \
+	ftype* y0; \
+	inc_t  incx0; \
+	inc_t  incy0; \
+\
+	/* Initialize BLIS. */ \
+	bli_init_auto(); \
+\
+	/* Convert/typecast negative values of n to zero. */ \
+	bli_convert_blas_dim1( *n, n0 ); \
+\
+	/* If the input increments are negative, adjust the pointers so we can
+	   use positive increments instead. */ \
+	bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
+	bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
+\
+	/* Call the BLIS interface with the converted length, pointers and increments. */ \
+	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
+	( \
+	  BLIS_NO_CONJUGATE, /* no conjugation is requested for x */ \
+	  n0, \
+	  (ftype*)alpha, \
+	  x0, incx0, \
+	  (ftype*)beta, \
+	  y0, incy0, \
+	  NULL, /* NOTE(review): presumably the context argument; NULL likely selects a default - confirm */ \
+	  NULL  /* NOTE(review): presumably the runtime argument - confirm */ \
+	); \
+\
+	/* Finalize BLIS. */ \
+	bli_finalize_auto(); \
+}
+/* NOTE(review): presumably instantiates GENTFUNC once per BLAS datatype with blasname=axpby and blisname=axpbyv - confirm against the INSERT_GENTFUNC_BLAS definition. */
+#ifdef BLIS_ENABLE_BLAS
+INSERT_GENTFUNC_BLAS( axpby, axpbyv )
+#endif
diff --git a/frame/compat/extra/bla_axpby.h b/frame/compat/extra/bla_axpby.h
new file mode 100644
index 000000000..ab2952be9
--- /dev/null
+++ b/frame/compat/extra/bla_axpby.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype BLAS-to-BLIS interfaces. +// +#undef GENTPROT +#define GENTPROT( ftype, ch, blasname ) \ +\ +BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ + ( \ + const f77_int* n, \ + const ftype* alpha, \ + const ftype* x, const f77_int* incx, \ + const ftype* beta, \ + ftype* y, const f77_int* incy \ + ); + +#ifdef BLIS_ENABLE_BLAS +INSERT_GENTPROT_BLAS( axpby ) +#endif + diff --git a/frame/compat/extra/bla_gemm_batch.c b/frame/compat/extra/bla_gemm_batch.c new file mode 100644 index 000000000..be84572a3 --- /dev/null +++ b/frame/compat/extra/bla_gemm_batch.c @@ -0,0 +1,254 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. 
+// + +#ifdef BLIS_BLAS3_CALLS_TAPI /* variant that forwards to PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) */ + +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa_array, \ + const f77_char* transb_array, \ + const f77_int* m_array, \ + const f77_int* n_array, \ + const f77_int* k_array, \ + const ftype* alpha_array, \ + const ftype** a_array, const f77_int* lda_array, \ + const ftype** b_array, const f77_int* ldb_array, \ + const ftype* beta_array, \ + ftype** c_array, const f77_int* ldc_array, \ + const f77_int* group_count, \ + const f77_int* group_size \ + ) \ +{ \ + trans_t blis_transa; \ + trans_t blis_transb; \ + dim_t m0, n0, k0; \ + inc_t rs_a, cs_a; \ + inc_t rs_b, cs_b; \ + inc_t rs_c, cs_c; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Perform BLAS parameter checking, once per group and before any group is computed. */ \ + for ( f77_int gi = 0; gi < *group_count; gi++ ) \ + { \ + PASTEBLACHK(blisname) \ + ( \ + MKSTR(ch), \ + MKSTR(blisname), \ + transa_array+gi, \ + transb_array+gi, \ + m_array+gi, \ + n_array+gi, \ + k_array+gi, \ + lda_array+gi, \ + ldb_array+gi, \ + ldc_array+gi \ + ); \ + } \ +\ + f77_int idx = 0; /* position in a_array/b_array/c_array; advances once per matrix, across all groups */ \ +\ + for ( f77_int i = 0; i < *group_count; i++ ) \ + { \ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( transa_array[i], &blis_transa ); \ + bli_param_map_netlib_to_blis_trans( transb_array[i], &blis_transb ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( m_array[i], m0 ); \ + bli_convert_blas_dim1( n_array[i], n0 ); \ + bli_convert_blas_dim1( k_array[i], k0 ); \ +\ + /* Set the row and column strides of the matrix operands (unit row stride, leading dimension as column stride). */ \ + rs_a = 1; \ + cs_a = lda_array[i]; \ + rs_b = 1; \ + cs_b = ldb_array[i]; \ + rs_c = 1; \ + cs_c = ldc_array[i]; \ +\ + for ( f77_int j = 0; j < group_size[i]; j++ ) \ + { \ + /* Call BLIS interface. 
*/ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_transa, \ + blis_transb, \ + m0, \ + n0, \ + k0, \ + (ftype*)(alpha_array + i), /* scalars are indexed per group (i) */ \ + (ftype*)*(a_array + idx), rs_a, cs_a, /* matrices are indexed per problem (idx) */ \ + (ftype*)*(b_array + idx), rs_b, cs_b, \ + (ftype*)(beta_array + i), \ + (ftype*)*(c_array + idx), rs_c, cs_c, \ + NULL, \ + NULL \ + ); \ +\ + idx++; \ + } \ + } \ +\ + bli_finalize_auto(); /* Finalize BLIS. */ \ +} + +#else /* same interface; operands are wrapped in obj_t and passed to PASTEMAC(blisname,BLIS_OAPI_EX_SUF) */ + +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa_array, \ + const f77_char* transb_array, \ + const f77_int* m_array, \ + const f77_int* n_array, \ + const f77_int* k_array, \ + const ftype* alpha_array, \ + const ftype** a_array, const f77_int* lda_array, \ + const ftype** b_array, const f77_int* ldb_array, \ + const ftype* beta_array, \ + ftype** c_array, const f77_int* ldc_array, \ + const f77_int* group_count, \ + const f77_int* group_size ) \ +{ \ + trans_t blis_transa; \ + trans_t blis_transb; \ + dim_t m0, n0, k0; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Perform BLAS parameter checking, once per group and before any group is computed. */ \ + for ( f77_int gi = 0; gi < *group_count; gi++ ) \ + { \ + PASTEBLACHK(blisname) \ + ( \ + MKSTR(ch), \ + MKSTR(blisname), \ + transa_array+gi, \ + transb_array+gi, \ + m_array+gi, \ + n_array+gi, \ + k_array+gi, \ + lda_array+gi, \ + ldb_array+gi, \ + ldc_array+gi \ + ); \ + } \ +\ + const num_t dt = PASTEMAC(ch,type); \ +\ + f77_int idx = 0, i, j; /* idx: position in a_array/b_array/c_array; advances once per matrix, across all groups */ \ +\ + for ( i = 0; i < *group_count; i++ ) \ + { \ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( transa_array[i], &blis_transa ); \ + bli_param_map_netlib_to_blis_trans( transb_array[i], &blis_transb ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( m_array[i], m0 ); \ + bli_convert_blas_dim1( n_array[i], n0 ); \ + bli_convert_blas_dim1( k_array[i], k0 ); \ +\ + /* Set the row and column strides of the matrix operands. 
*/ \ + const inc_t rs_a = 1; \ + const inc_t cs_a = lda_array[i]; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = ldb_array[i]; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = ldc_array[i]; \ +\ + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; /* per-group alpha, attached below as a 1x1 object */ \ + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; /* per-group beta, attached below as a 1x1 object */ \ +\ + dim_t m0_a, n0_a; /* dimensions later passed to bli_obj_init_finish for A */ \ + dim_t m0_b, n0_b; /* dimensions later passed to bli_obj_init_finish for B */ \ +\ + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \ +\ + bli_obj_init_finish_1x1( dt, (ftype*)(alpha_array + i), &alphao ); \ + bli_obj_init_finish_1x1( dt, (ftype*)(beta_array + i), &betao ); \ +\ + for( j = 0; j < group_size[i]; j++ ) \ + { \ + obj_t ao = BLIS_OBJECT_INITIALIZER; \ + obj_t bo = BLIS_OBJECT_INITIALIZER; \ + obj_t co = BLIS_OBJECT_INITIALIZER; \ +\ + bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)*(a_array + idx), rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)*(b_array + idx), rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m0, n0, (ftype*)*(c_array + idx), rs_c, cs_c, &co ); \ + bli_obj_set_conjtrans( blis_transa, &ao ); \ + bli_obj_set_conjtrans( blis_transb, &bo ); \ +\ + PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ + ( \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co, \ + NULL, \ + NULL \ + ); \ +\ + idx++; \ + } \ + } \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#endif /* BLIS_BLAS3_CALLS_TAPI */ + +#ifdef BLIS_ENABLE_BLAS +INSERT_GENTFUNC_BLAS( gemm_batch, gemm ) +#endif + diff --git a/frame/compat/extra/bla_gemm_batch.h b/frame/compat/extra/bla_gemm_batch.h new file mode 100644 index 000000000..f997f4b8e --- /dev/null +++ b/frame/compat/extra/bla_gemm_batch.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype BLAS-to-BLIS interfaces. 
+// +#undef GENTPROT +#define GENTPROT( ftype, ch, blasname ) \ +\ +BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa_array, \ + const f77_char* transb_array, \ + const f77_int* m_array, \ + const f77_int* n_array, \ + const f77_int* k_array, \ + const ftype* alpha_array, \ + const ftype** a_array, const f77_int* lda_array, \ + const ftype** b_array, const f77_int* ldb_array, \ + const ftype* beta_array, \ + ftype** c_array, const f77_int* ldc_array, \ + const f77_int* group_count, \ + const f77_int* group_size \ + ); + +#ifdef BLIS_ENABLE_BLAS +INSERT_GENTPROT_BLAS( gemm_batch ) +#endif + diff --git a/test/Makefile b/test/Makefile index bbd817f2d..ae998ccde 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,11 +1,11 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc. +# Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -143,9 +143,9 @@ CFLAGS += -I$(TEST_SRC_PATH) # # Define the operations we will test. -TEST_OPS := dotv axpyv \ +TEST_OPS := dotv axpyv axpbyv\ gemv ger hemv her her2 trmv trsv \ - gemm hemm herk her2k trmm trsm + gemm gemm_batch hemm herk her2k trmm trsm # Optionally test gemmt, which some libraries might not implement. ifeq ($(BUILD_GEMMT),yes) diff --git a/test/test_axpbyv.c b/test/test_axpbyv.c new file mode 100644 index 000000000..28be2542c --- /dev/null +++ b/test/test_axpbyv.c @@ -0,0 +1,293 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef WIN32 +#include <io.h> +#else +#include <unistd.h> +#endif +#include "blis.h" + +//#define PRINT +#ifdef BLIS_ENABLE_CBLAS +//#define CHECK_CBLAS +#endif + +#ifdef CHECK_CBLAS +#include "cblas.h" +#endif + +/* + * BLIS interface API will be called by default. + * To call BLAS API, modify line 159 to '#if 0'. 
+ * To call cblas API, modify line 159 to '#if 0'and define the + * macro 'CHECK_CBLAS' in line 44 + * + *Sample prototype for BLAS interface API is as follows: + * n alpha x incx beta y incy + *void daxpbyv_( int*, double*, double*, int*, double*, double*, int* ); + */ + +int main( int argc, char** argv ) +{ + obj_t x, y; + obj_t y_save; + obj_t alpha, beta; + dim_t n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int n_input; + num_t dt_x, dt_y; + num_t dt_alpha, dt_beta; + int r, n_repeats; + num_t dt; + + double dtime; + double dtime_save; + double gflops; + + bli_init(); + + n_repeats = 3; + +#ifndef PRINT + p_begin = 40; + p_end = 4000; + p_inc = 40; + + n_input = -1; +#else + p_begin = 16; + p_end = 16; + p_inc = 1; + + n_input = 15; +#endif + +#if 1 + dt = BLIS_FLOAT; + //dt = BLIS_DOUBLE; +#else + //dt = BLIS_SCOMPLEX; + dt = BLIS_DCOMPLEX; +#endif + + + dt_x = dt_y = dt_alpha = dt_beta = dt; + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. 
+ for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_axpbyv_blis" ); +#else + printf( "data_axpbyv_%s", BLAS ); +#endif + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, 0.0 ); + + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) + { + + if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); + + bli_obj_create( dt_x, n, 1, 0, 0, &x ); + bli_obj_create( dt_y, n, 1, 0, 0, &y ); + bli_obj_create( dt_y, n, 1, 0, 0, &y_save ); + + bli_randm( &x ); + bli_randm( &y ); + + bli_setsc( (0.9/1.0), 0.2, &alpha ); + bli_setsc( -(1.1/1.0), 0.3, &beta ); + + bli_copym( &y, &y_save ); + + dtime_save = 1.0e9; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &y_save, &y ); + + dtime = bli_clock(); + +#ifdef PRINT + bli_printm( "alpha", &alpha, "%4.1f", "" ); + bli_printm( "beta" , &beta, "%4.1f", "" ); + + bli_printm( "x", &x, "%4.1f", "" ); + bli_printm( "y", &y, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_axpbyv( &alpha, + &x, + &beta, + &y ); +#else + if ( bli_is_float( dt ) ) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + float alphap = *(( float * )bli_obj_buffer( &alpha )); + float betap = *(( float * )bli_obj_buffer( &beta )); + float* xp = bli_obj_buffer( &x ); + float* yp = bli_obj_buffer( &y ); +#ifdef CHECK_CBLAS + cblas_saxpby( nn, + alphap, + xp, incx, + betap, + yp, incy ); +#else + saxpby_( &nn, + &alphap, + xp, &incx, + &betap, + yp, &incy ); + +#endif + } + else if ( bli_is_double( dt ) ) + { + + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + double alphap = *(( double * )bli_obj_buffer( &alpha )); + double betap = *(( double * )bli_obj_buffer( 
&beta )); + double* xp = bli_obj_buffer( &x ); + double* yp = bli_obj_buffer( &y ); +#ifdef CHECK_CBLAS + cblas_daxpby( nn, + alphap, + xp, incx, + betap, + yp, incy ); +#else + daxpby_( &nn, + &alphap, + xp, &incx, + &betap, + yp, &incy ); +#endif + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + void* alphap = bli_obj_buffer( &alpha ); + void* betap = bli_obj_buffer( &beta ); + void* xp = bli_obj_buffer( &x ); + void* yp = bli_obj_buffer( &y ); +#ifdef CHECK_CBLAS + cblas_caxpby( nn, + alphap, + xp, incx, + betap, + yp, incy ); +#else + caxpby_( &nn, + ( scomplex* )alphap, + ( scomplex* )xp, &incx, + ( scomplex* )betap, + ( scomplex* )yp, &incy ); +#endif + } + else if ( bli_is_dcomplex( dt )) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + void* alphap = bli_obj_buffer( &alpha ); + void* betap = bli_obj_buffer( &beta ); + void* xp = bli_obj_buffer( &x ); + void* yp = bli_obj_buffer( &y ); +#ifdef CHECK_CBLAS + cblas_zaxpby( nn, + alphap, + xp, incx, + betap, + yp, incy ); +#else + zaxpby_( &nn, + ( dcomplex* )alphap, + ( dcomplex* )xp, &incx, + ( dcomplex* )betap, + ( dcomplex* )yp, &incy ); +#endif + } +#endif + +#ifdef PRINT + bli_printm( "y after", &y, "%4.1f", "" ); + exit(1); +#endif + + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 3.0 * n ) / ( dtime_save * 1.0e9 ); + +#ifdef BLIS + printf( "data_axpbyv_blis" ); +#else + printf( "data_axpbyv_%s", BLAS ); +#endif + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &x ); + bli_obj_free( &y ); + bli_obj_free( &y_save ); + } + + bli_finalize(); + + return 0; +} diff --git a/test/test_gemm_batch.c b/test/test_gemm_batch.c new file mode 100644 
index 000000000..5660e4150 --- /dev/null +++ b/test/test_gemm_batch.c @@ -0,0 +1,584 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef WIN32 +#include <io.h> +#else +#include <unistd.h> +#endif +#include "blis.h" + +//#define CHECK_CBLAS +#ifdef CHECK_CBLAS +#include "cblas.h" +#endif + +/* Format for FILE input + * For each input set, first line contains 'storage scheme' + * and 'group count' separated by space. 
+ * Following 'group_count' number of lines contains all the parameters of + * each group separated by space in each line in the following order: + * tA tB m n k lda ldb ldc alpha_r alpha_i beta_r beta_i group_size + * + * Example: + * c 2 + * n n 4 8 4 4 4 4 1.1 0.0 0.9 0.0 2 + * n n 3 3 6 3 6 3 1.0 0.0 2.0 0.0 2 + * + */ + +//#define FILE_IN_OUT +#ifndef FILE_IN_OUT +#define GRP_COUNT 2 +#endif + +//#define PRINT + +int main( int argc, char** argv ) +{ + num_t dt; + + char stor_scheme; + dim_t i, j, idx; + dim_t r, n_repeats; + + double dtime; + double dtime_save; + double gflops; + + dim_t total_count = 0; + +#if 1 + dt = BLIS_FLOAT; + //dt = BLIS_DOUBLE; +#else + dt = BLIS_SCOMPLEX; + //dt = BLIS_DCOMPLEX; +#endif + + n_repeats = 1; + +#ifdef FILE_IN_OUT + FILE* fin = NULL; + FILE* fout = NULL; + + if(argc < 3) + { + printf("Usage: ./test_gemm_batch_XX.x input.csv output.csv\n"); + exit(1); + } + + fin = fopen(argv[1], "r"); + if( fin == NULL ) + { + printf("Error opening input file %s \n", argv[1]); + exit(1); + } + + fout = fopen(argv[2], "w"); + if(fout == NULL) + { + printf("Error opening output file %s\n",argv[2]); + exit(1); + } + + dim_t GRP_COUNT; + + fprintf(fout, "m\t n\t k\t lda\t ldb\t ldc\t transa\t transb\t grp_size\n"); + + while(fscanf(fin, "%c %ld\n", &stor_scheme, &GRP_COUNT) == 2) + { + char transa[GRP_COUNT]; + char transb[GRP_COUNT]; + + dim_t m[GRP_COUNT]; + dim_t n[GRP_COUNT]; + dim_t k[GRP_COUNT]; + + dim_t lda[GRP_COUNT]; + dim_t ldb[GRP_COUNT]; + dim_t ldc[GRP_COUNT]; + + double alpha_real[GRP_COUNT]; + double alpha_imag[GRP_COUNT]; + double beta_real[GRP_COUNT]; + double beta_imag[GRP_COUNT]; + + dim_t group_size[GRP_COUNT]; + obj_t alpha[GRP_COUNT], beta[GRP_COUNT]; + + total_count = 0; + for(i = 0; i < GRP_COUNT; i++) + { + fscanf(fin, "%c %c %ld %ld %ld %ld %ld %ld %lf %lf %lf %lf %ld\n", &transa[i], &transb[i], &m[i], &n[i], &k[i], &lda[i], &ldb[i], &ldc[i], &alpha_real[i], &alpha_imag[i], &beta_real[i], &beta_imag[i], 
&group_size[i]); + + total_count += group_size[i]; + } +#else + printf("m\t n\t k\t lda\t ldb\t ldc\t transa\t transb\t grp_size\n"); + + stor_scheme = 'c'; + + dim_t m[GRP_COUNT] = {4, 3}; + dim_t n[GRP_COUNT] = {8, 3}; + dim_t k[GRP_COUNT] = {4, 6}; + + dim_t lda[GRP_COUNT] = {4, 3}; + dim_t ldb[GRP_COUNT] = {4, 6}; + dim_t ldc[GRP_COUNT] = {4, 3}; + + char transa[GRP_COUNT] = {'N', 'N'}; + char transb[GRP_COUNT] = {'N', 'N'}; + + double alpha_real[GRP_COUNT] = {1.1, 1.0}; + double alpha_imag[GRP_COUNT] = {0.0, 0.0}; + + double beta_real[GRP_COUNT] = {0.9, 2.0}; + double beta_imag[GRP_COUNT] = {0.0, 0.0}; + + dim_t group_size[GRP_COUNT] = {2,2}; + + obj_t alpha[GRP_COUNT], beta[GRP_COUNT]; + + total_count = 0; + for(i = 0; i < GRP_COUNT; i++) + total_count += group_size[i]; + +#endif + obj_t a[total_count], b[total_count]; + obj_t c[total_count], c_save[total_count]; + f77_int f77_m[GRP_COUNT], f77_n[GRP_COUNT], f77_k[GRP_COUNT]; + f77_int f77_lda[GRP_COUNT], f77_ldb[GRP_COUNT], f77_ldc[GRP_COUNT]; + f77_int f77_group_size[GRP_COUNT]; + f77_int f77_group_count = GRP_COUNT; +#ifdef CHECK_CBLAS + enum CBLAS_ORDER cblas_order; + enum CBLAS_TRANSPOSE cblas_transa[GRP_COUNT]; + enum CBLAS_TRANSPOSE cblas_transb[GRP_COUNT]; + + if(stor_scheme == 'R' || stor_scheme == 'r') + cblas_order = CblasRowMajor; + else + cblas_order = CblasColMajor; + +#else + f77_char f77_transa[GRP_COUNT]; + f77_char f77_transb[GRP_COUNT]; + + if(stor_scheme == 'r' || stor_scheme == 'R' ) + { + printf("BLAS Interface doesn't support row-major order\n"); +#ifdef FILE_IN_OUT + continue; +#else + exit(1); +#endif + } +#endif + + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + { + bli_obj_create(dt, 1, 1, 0, 0, &alpha[i]); + bli_obj_create(dt, 1, 1, 0, 0, &beta[i] ); + + bli_setsc(alpha_real[i], alpha_imag[i], &alpha[i]); + bli_setsc(beta_real[i], beta_imag[i], &beta[i] ); + + trans_t blis_transa, blis_transb; + if(transa[i] == 't' || transa[i] == 'T') + blis_transa = BLIS_TRANSPOSE; + else if 
(transa[i] == 'c' || transa[i] == 'C') + blis_transa = BLIS_CONJ_TRANSPOSE; + else if ( transa[i] == 'n' || transa[i] == 'N') + blis_transa = BLIS_NO_TRANSPOSE; + else + { + printf("Illegal transA setting %c for group %ld\n", transa[i], i); + exit(1); + } + + if(transb[i] == 't' || transb[i] == 'T') + blis_transb = BLIS_TRANSPOSE; + else if (transb[i] == 'c' || transb[i] == 'C') + blis_transb = BLIS_CONJ_TRANSPOSE; + else if (transb[i] == 'n' || transb[i] == 'N') + blis_transb = BLIS_NO_TRANSPOSE; + else + { + printf("Illegal transB setting %c for group %ld\n", transb[i], i); + exit(1); + } +#ifdef CHECK_CBLAS + if(bli_is_trans( blis_transa )) + cblas_transa[i] = CblasTrans; + else if (bli_is_conjtrans( blis_transa )) + cblas_transa[i] = CblasConjTrans; + else + cblas_transa[i] = CblasNoTrans; + + if(bli_is_trans( blis_transb )) + cblas_transb[i] = CblasTrans; + else if (bli_is_conjtrans( blis_transb )) + cblas_transb[i] = CblasConjTrans; + else + cblas_transb[i] = CblasNoTrans; +#else + bli_param_map_blis_to_netlib_trans( blis_transa, &f77_transa[i]); + bli_param_map_blis_to_netlib_trans( blis_transb, &f77_transb[i]); + +#endif + dim_t m0_a, n0_a; + dim_t m0_b, n0_b; + bli_set_dims_with_trans( blis_transa, m[i], k[i], &m0_a, &n0_a ); + bli_set_dims_with_trans( blis_transb, k[i], n[i], &m0_b, &n0_b ); + if(stor_scheme == 'C' || stor_scheme == 'c') + { + for(j = 0; j < group_size[i]; j++) + { + bli_obj_create(dt, m0_a, n0_a, 1, lda[i], &a[idx]); + bli_obj_create(dt, m0_b, n0_b, 1, ldb[i], &b[idx]); + bli_obj_create(dt, m[i], n[i], 1, ldc[i], &c[idx]); + bli_obj_create(dt, m[i], n[i], 1, ldc[i], &c_save[idx]); + + bli_randm( &a[idx] ); + bli_randm( &b[idx] ); + bli_randm( &c[idx] ); + + bli_obj_set_conjtrans(blis_transa, &a[idx]); + bli_obj_set_conjtrans(blis_transb, &b[idx]); + idx++; + } + } + else if(stor_scheme == 'R' || stor_scheme == 'r') + { + for(j = 0; j < group_size[i]; j++) + { + bli_obj_create(dt, m0_a, n0_a, lda[i], 1, &a[idx]); + bli_obj_create(dt, 
m0_b, n0_b, ldb[i], 1, &b[idx]); + bli_obj_create(dt, m[i], n[i], ldc[i], 1, &c[idx]); + bli_obj_create(dt, m[i], n[i], ldc[i], 1, &c_save[idx]); + + bli_randm( &a[idx] ); + bli_randm( &b[idx] ); + bli_randm( &c[idx] ); + + bli_obj_set_conjtrans(blis_transa, &a[idx]); + bli_obj_set_conjtrans(blis_transb, &b[idx]); + idx++; + } + } + f77_m[i] = m[i]; + f77_n[i] = n[i]; + f77_k[i] = k[i]; + f77_lda[i] = lda[i]; + f77_ldb[i] = ldb[i]; + f77_ldc[i] = ldc[i]; + f77_group_size[i] = group_size[i]; + + } + + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + for(j = 0; j < group_size[i]; j++) + { + bli_copym(&c[idx], &c_save[idx]); + idx++; + } + + dtime_save = DBL_MAX; + + for( r = 0; r < n_repeats; ++r ) + { + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + for(j = 0; j < group_size[i]; j++) + { + bli_copym( &c_save[idx], &c[idx]); + idx++; + } + + dtime = bli_clock(); + +#ifdef PRINT + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + for(j = 0; j < group_size[i]; j++) + { + printf("Group: %ld Member: %ld\n", i, j); + + bli_printm("a", &a[idx], "%4.1f", ""); + bli_printm("b", &b[idx], "%4.1f", ""); + bli_printm("c", &c[idx], "%4.1f", ""); + + idx++; + } +#endif + + if(bli_is_float(dt)) + { + const float *ap[total_count], *bp[total_count]; + float *cp[total_count]; + float alphap[GRP_COUNT], betap[GRP_COUNT]; + + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + { + for(j = 0; j < group_size[i]; j++) + { + ap[idx] = bli_obj_buffer( &a[idx] ); + bp[idx] = bli_obj_buffer( &b[idx] ); + cp[idx] = bli_obj_buffer( &c[idx] ); + + idx++; + } + alphap[i] = *(float*)bli_obj_buffer_for_1x1(dt, &alpha[i]); + betap[i] = *(float*)bli_obj_buffer_for_1x1(dt, &beta[i] ); + } + +#ifdef CHECK_CBLAS + cblas_sgemm_batch( cblas_order, + cblas_transa, + cblas_transb, + f77_m, f77_n, f77_k, + alphap, ap, f77_lda, + bp, f77_ldb, + betap, cp, f77_ldc, + f77_group_count, + f77_group_size + ); +#else + sgemm_batch_( f77_transa, + f77_transb, + f77_m, f77_n, f77_k, + alphap, ap, f77_lda, + bp, f77_ldb, + betap, cp, 
f77_ldc, + &f77_group_count, + f77_group_size + ); +#endif + + } + else if(bli_is_double(dt)) + { + const double *ap[total_count], *bp[total_count]; + double *cp[total_count]; + double alphap[GRP_COUNT], betap[GRP_COUNT]; + + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + { + for(j = 0; j < group_size[i]; j++) + { + ap[idx] = bli_obj_buffer( &a[idx] ); + bp[idx] = bli_obj_buffer( &b[idx] ); + cp[idx] = bli_obj_buffer( &c[idx] ); + + idx++; + } + alphap[i] = *(double*)bli_obj_buffer_for_1x1(dt, &alpha[i]); + betap[i] = *(double*)bli_obj_buffer_for_1x1(dt, &beta[i] ); + } +#ifdef CHECK_CBLAS + cblas_dgemm_batch( cblas_order, + cblas_transa, + cblas_transb, + f77_m, f77_n, f77_k, + alphap, ap, f77_lda, + bp, f77_ldb, + betap, cp, f77_ldc, + f77_group_count, + f77_group_size + ); +#else + dgemm_batch_( f77_transa, + f77_transb, + f77_m, f77_n, f77_k, + alphap, ap, f77_lda, + bp, f77_ldb, + betap, cp, f77_ldc, + &f77_group_count, + f77_group_size + ); +#endif + + } + else if(bli_is_scomplex(dt)) + { + const scomplex *ap[total_count], *bp[total_count]; + scomplex *cp[total_count]; + scomplex alphap[GRP_COUNT], betap[GRP_COUNT]; + + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + { + for(j = 0; j < group_size[i]; j++) + { + ap[idx] = bli_obj_buffer( &a[idx] ); + bp[idx] = bli_obj_buffer( &b[idx] ); + cp[idx] = bli_obj_buffer( &c[idx] ); + + idx++; + } + alphap[i] = *(scomplex*)bli_obj_buffer_for_1x1(dt, &alpha[i]); + betap[i] = *(scomplex*)bli_obj_buffer_for_1x1(dt, &beta[i] ); + } +#ifdef CHECK_CBLAS + cblas_cgemm_batch( cblas_order, + cblas_transa, + cblas_transb, + f77_m, f77_n, f77_k, + (const void*)alphap, + (const void**)ap, f77_lda, + (const void**)bp, f77_ldb, + (const void*)betap, (void**)cp, f77_ldc, + f77_group_count, + f77_group_size + ); +#else + cgemm_batch_( f77_transa, + f77_transb, + f77_m, f77_n, f77_k, + alphap, ap, f77_lda, + bp, f77_ldb, + betap, cp, f77_ldc, + &f77_group_count, + f77_group_size + ); +#endif + } + else if(bli_is_dcomplex(dt)) + { + const 
dcomplex *ap[total_count], *bp[total_count]; + dcomplex *cp[total_count]; + dcomplex alphap[GRP_COUNT], betap[GRP_COUNT]; + + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + { + for(j = 0; j < group_size[i]; j++) + { + ap[idx] = bli_obj_buffer( &a[idx] ); + bp[idx] = bli_obj_buffer( &b[idx] ); + cp[idx] = bli_obj_buffer( &c[idx] ); + + idx++; + } + alphap[i] = *(dcomplex*)bli_obj_buffer_for_1x1(dt, &alpha[i]); + betap[i] = *(dcomplex*)bli_obj_buffer_for_1x1(dt, &beta[i] ); + } + +#ifdef CHECK_CBLAS + cblas_zgemm_batch( cblas_order, + cblas_transa, + cblas_transb, + f77_m, f77_n, f77_k, + (const void*)alphap, + (const void**)ap, f77_lda, + (const void**)bp, f77_ldb, + (const void*)betap, (void**)cp, f77_ldc, + f77_group_count, + f77_group_size + ); +#else + zgemm_batch_( f77_transa, + f77_transb, + f77_m, f77_n, f77_k, + alphap, ap, f77_lda, + bp, f77_ldb, + betap, cp, f77_ldc, + &f77_group_count, + f77_group_size + ); +#endif + } +#ifdef PRINT + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + for(j = 0; j < group_size[i]; j++) + { + printf("Group: %ld Member: %ld\n", i, j); + bli_printm("c after", &c[idx], "%4.1f", ""); + + idx++; + } +#endif + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + dim_t fp_ops = 0; + for(i = 0; i < GRP_COUNT; i++) + fp_ops += 2.0 * m[i] * k[i] * n[i] * group_size[i]; + + gflops = fp_ops / (dtime_save * 1.0e9 ); + + if(bli_is_complex( dt ) ) gflops *= 4.0; + +#ifdef FILE_IN_OUT + fprintf(fout, "Stor_scheme = %c, group_count = %lu, gflops = %7.2f\n", stor_scheme, GRP_COUNT, gflops); + for(i = 0; i < GRP_COUNT; i++) + fprintf(fout, "%4lu \t %4lu\t %4lu\t %4lu\t %4lu\t %4lu\t %c\t %c\t %4lu\n", m[i], n[i], k[i], lda[i], ldb[i], ldc[i], transa[i], transb[i], group_size[i]); + + fflush(fout); +#else + printf( "Stor_scheme = %c, group_count = %d, gflops = %7.2f\n", stor_scheme, GRP_COUNT, gflops); + for(i = 0; i < GRP_COUNT; i++) + printf("%4lu \t %4lu\t %4lu\t %4lu\t %4lu\t %4lu\t %c\t %c\t %4lu\n", m[i], n[i], k[i], lda[i], ldb[i], 
ldc[i], transa[i], transb[i], group_size[i]); + +#endif + + idx = 0; + for(i = 0; i < GRP_COUNT; i++) + { + bli_obj_free( &alpha[i]); + bli_obj_free( &beta[i] ); + + for(j = 0; j < group_size[i]; j++ ) + { + bli_obj_free( &a[idx]); + bli_obj_free( &b[idx]); + bli_obj_free( &c[idx]); + bli_obj_free( &c_save[idx]); + + idx++; + } + } +#ifdef FILE_IN_OUT + } + fclose(fin); + fclose(fout); +#endif + return 0; +} + From 7bde468c6f7ecc4b5322d2ade1ae9c0b88e6b9f3 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 13 Nov 2021 16:39:37 -0600 Subject: [PATCH 003/230] Added support for addons. Details: - Implemented a new feature called addons, which are similar to sandboxes except that there is no requirement to define gemm or any other particular operation. - Updated configure to accept --enable-addon= or -a syntax for requesting an addon be included within a BLIS build. configure now outputs the list of enabled addons into config.mk. It also outputs the corresponding #include directives for the addons' headers to a new companion to the bli_config.h header file named bli_addon.h. Because addons may wish to make use of existing BLIS types within their own definitions, the addons' headers must be included sometime after that of bli_config.h (which currently is #included before bli_type_defs.h). This is why the #include directives needed to go into a new top-level header file rather than the existing bli_config.h file. - Added a markdown document, docs/Addons.md, to explain addons, how to build with them, and what assumptions their authors should keep in mind as they create them. - Added a gemmlike-like implementation of sandwich gemm called 'gemmd' as an addon in addon/gemmd. The code uses a 'bao_' prefix for local functions, including the user-level object and typed APIs. - Updated .gitignore so that git ignores bli_addon.h files. 
--- .gitignore | 1 + Makefile | 48 ++ addon/gemmd/attic/bli_gemm_ex.c | 88 +++ addon/gemmd/bao_gemmd.c | 305 +++++++++++ addon/gemmd/bao_gemmd.h | 105 ++++ addon/gemmd/bao_gemmd_bp_var1.c | 530 ++++++++++++++++++ addon/gemmd/bao_gemmd_bp_var2.c | 602 +++++++++++++++++++++ addon/gemmd/bao_gemmd_check.c | 131 +++++ addon/gemmd/bao_gemmd_check.h | 50 ++ addon/gemmd/bao_gemmd_var.h | 126 +++++ addon/gemmd/bao_l3_packm_a.c | 330 +++++++++++ addon/gemmd/bao_l3_packm_a.h | 123 +++++ addon/gemmd/bao_l3_packm_b.c | 330 +++++++++++ addon/gemmd/bao_l3_packm_b.h | 123 +++++ addon/gemmd/bao_l3_packm_var.h | 69 +++ addon/gemmd/bao_l3_packm_var1.c | 195 +++++++ addon/gemmd/bao_l3_packm_var2.c | 245 +++++++++ addon/gemmd/bao_packm_cxk.c | 199 +++++++ addon/gemmd/bao_packm_cxk.h | 59 ++ addon/gemmd/gemmd.h | 54 ++ addon/gemmd/thread/bao_l3_decor.h | 75 +++ addon/gemmd/thread/bao_l3_decor_openmp.c | 140 +++++ addon/gemmd/thread/bao_l3_decor_openmp.h | 44 ++ addon/gemmd/thread/bao_l3_decor_pthreads.c | 220 ++++++++ addon/gemmd/thread/bao_l3_decor_pthreads.h | 47 ++ addon/gemmd/thread/bao_l3_decor_single.c | 143 +++++ addon/gemmd/thread/bao_l3_decor_single.h | 44 ++ build/bli_addon.h.in | 47 ++ build/config.mk.in | 4 + common.mk | 121 ++++- configure | 152 +++++- docs/Addons.md | 231 ++++++++ frame/include/bli_config_macro_defs.h | 5 +- frame/include/blis.h | 8 + 34 files changed, 4961 insertions(+), 33 deletions(-) create mode 100644 addon/gemmd/attic/bli_gemm_ex.c create mode 100644 addon/gemmd/bao_gemmd.c create mode 100644 addon/gemmd/bao_gemmd.h create mode 100644 addon/gemmd/bao_gemmd_bp_var1.c create mode 100644 addon/gemmd/bao_gemmd_bp_var2.c create mode 100644 addon/gemmd/bao_gemmd_check.c create mode 100644 addon/gemmd/bao_gemmd_check.h create mode 100644 addon/gemmd/bao_gemmd_var.h create mode 100644 addon/gemmd/bao_l3_packm_a.c create mode 100644 addon/gemmd/bao_l3_packm_a.h create mode 100644 addon/gemmd/bao_l3_packm_b.c create mode 100644 addon/gemmd/bao_l3_packm_b.h 
create mode 100644 addon/gemmd/bao_l3_packm_var.h create mode 100644 addon/gemmd/bao_l3_packm_var1.c create mode 100644 addon/gemmd/bao_l3_packm_var2.c create mode 100644 addon/gemmd/bao_packm_cxk.c create mode 100644 addon/gemmd/bao_packm_cxk.h create mode 100644 addon/gemmd/gemmd.h create mode 100644 addon/gemmd/thread/bao_l3_decor.h create mode 100644 addon/gemmd/thread/bao_l3_decor_openmp.c create mode 100644 addon/gemmd/thread/bao_l3_decor_openmp.h create mode 100644 addon/gemmd/thread/bao_l3_decor_pthreads.c create mode 100644 addon/gemmd/thread/bao_l3_decor_pthreads.h create mode 100644 addon/gemmd/thread/bao_l3_decor_single.c create mode 100644 addon/gemmd/thread/bao_l3_decor_single.h create mode 100644 build/bli_addon.h.in create mode 100644 docs/Addons.md diff --git a/.gitignore b/.gitignore index 49b22c2b8..a24fe2b0e 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ config.mk bli_config.h +bli_addon.h # -- monolithic headers -- diff --git a/Makefile b/Makefile index b5e036744..992983328 100644 --- a/Makefile +++ b/Makefile @@ -114,6 +114,7 @@ BASE_OBJ_CONFIG_PATH := $(BASE_OBJ_PATH)/$(CONFIG_DIR) BASE_OBJ_FRAME_PATH := $(BASE_OBJ_PATH)/$(FRAME_DIR) BASE_OBJ_REFKERN_PATH := $(BASE_OBJ_PATH)/$(REFKERN_DIR) BASE_OBJ_KERNELS_PATH := $(BASE_OBJ_PATH)/$(KERNELS_DIR) +BASE_OBJ_ADDON_PATH := $(BASE_OBJ_PATH)/$(ADDON_DIR) BASE_OBJ_SANDBOX_PATH := $(BASE_OBJ_PATH)/$(SANDBOX_DIR) # --- Define install target names for static libraries --- @@ -210,6 +211,10 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \ # Generate object file paths for all of the portable framework source code. MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH)) +# Generate object file paths for the addon source code. If no addons were +# enabled at configure-time, this variable will be empty.
+MK_ADDON_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH)) + # Generate object file paths for the sandbox source code. If a sandbox was not # enabled a configure-time, this variable will we empty. MK_SANDBOX_OBJS := $(call gen-obj-paths-from-src,$(SANDBOX_SRC_SUFS),$(MK_SANDBOX_SRC),$(SANDBOX_PATH),$(BASE_OBJ_SANDBOX_PATH)) @@ -219,6 +224,7 @@ MK_BLIS_OBJS := $(MK_CONFIG_OBJS) \ $(MK_KERNELS_OBJS) \ $(MK_REFKERN_OBJS) \ $(MK_FRAME_OBJS) \ + $(MK_ADDON_OBJS) \ $(MK_SANDBOX_OBJS) # Optionally filter out the BLAS and CBLAS compatibility layer object files. @@ -549,6 +555,28 @@ else endif endef +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +define make-c99-addon-rule +$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-c99text-for,$(1)) + @$(CC) $(call get-addon-c99flags-for,$(1)) -c $$< -o $$@ +endif +endef + +define make-cxx-addon-rule +$(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-cxxtext-for,$(1)) + @$(CXX) $(call get-addon-cxxflags-for,$(1)) -c $$< -o $$@ +endif +endef + # first argument: a configuration name from the union of config_list and # config_name, used to look up the CFLAGS to use during compilation. 
define make-c99-sandbox-rule @@ -601,6 +629,16 @@ $(foreach conf, $(CONFIG_LIST), $(eval $(call make-refkern-rule,$(conf)))) $(foreach suf, $(KERNELS_SRC_SUFS), \ $(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call get-config-for-kset,$(kset)),$(suf))))) +# Instantiate the build rule for C addon files. Use the CFLAGS for the +# configuration family. +$(foreach suf, $(ADDON_C99_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf))))) + +# Instantiate the build rule for C++ addon files. Use the CFLAGS for the +# configuration family. +$(foreach suf, $(ADDON_CXX_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-cxx-addon-rule,$(conf),$(suf))))) + # Instantiate the build rule for C sandbox files. Use the CFLAGS for the # configuration family. $(foreach suf, $(SANDBOX_C99_SUFS), \ @@ -1078,6 +1116,9 @@ ifeq ($(ENABLE_VERBOSE),yes) - $(FIND) $(FRAME_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) - $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) - $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +ifneq ($(ADDON_LIST),) + - $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +endif ifneq ($(SANDBOX),) - $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) endif @@ -1090,6 +1131,10 @@ else @- $(FIND) $(REFKERN_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) @echo "Removing makefile fragments from $(KERNELS_FRAG_PATH)" @- $(FIND) $(KERNELS_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +ifneq ($(ADDON_LIST),) + @echo "Removing makefile fragments from $(ADDON_FRAG_PATH)" + @- $(FIND) $(ADDON_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) +endif ifneq ($(SANDBOX),) @echo "Removing makefile fragments from $(SANDBOX_FRAG_PATH)" @- $(FIND) $(SANDBOX_FRAG_PATH) -name "$(FRAGMENT_MK)" | $(XARGS) $(RM_F) @@ -1210,6 +1255,7 @@ endif # IS_CONFIGURED distclean: cleanmk cleanh cleanlib cleantest ifeq 
($(IS_CONFIGURED),yes) ifeq ($(ENABLE_VERBOSE),yes) + - $(RM_F) $(BLIS_ADDON_H) - $(RM_F) $(BLIS_CONFIG_H) - $(RM_F) $(CONFIG_MK_FILE) - $(RM_F) $(PC_OUT_FILE) @@ -1217,6 +1263,8 @@ ifeq ($(ENABLE_VERBOSE),yes) - $(RM_RF) $(LIB_DIR) - $(RM_RF) $(INCLUDE_DIR) else + @echo "Removing $(BLIS_ADDON_H)" + @$(RM_F) $(BLIS_ADDON_H) @echo "Removing $(BLIS_CONFIG_H)" @$(RM_F) $(BLIS_CONFIG_H) @echo "Removing $(CONFIG_MK_FILE)" diff --git a/addon/gemmd/attic/bli_gemm_ex.c b/addon/gemmd/attic/bli_gemm_ex.c new file mode 100644 index 000000000..0f40d1cb3 --- /dev/null +++ b/addon/gemmd/attic/bli_gemm_ex.c @@ -0,0 +1,88 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // A switch to easily toggle whether we use the addon implementation + // of bao_gemmd() as the implementation for bli_gemm(). (This allows for + // easy testing of bao_gemmd() via the testsuite.) + if ( 1 ) + { + const dim_t k = bli_obj_width_after_trans( a ); + const num_t dt = bli_obj_dt( c ); + obj_t d; + + bli_obj_create( dt, k, 1, 1, k, &d ); + bli_setv( &BLIS_ONE, &d ); + //bli_randv( &d ); + + bao_gemmd_ex( alpha, a, &d, b, beta, c, cntx, rntm ); + + bli_obj_free( &d ); + return; + } + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Obtain a valid (native) context from the gks if necessary. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Check the operands. + if ( bli_error_checking_is_enabled() ) + bli_gemm_check( alpha, a, b, beta, c, cntx ); + + // Invoke the operation's front end. 
+ bli_gemm_front + ( + alpha, a, b, beta, c, cntx, rntm, NULL + ); +} + diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c new file mode 100644 index 000000000..71d49806b --- /dev/null +++ b/addon/gemmd/bao_gemmd.c @@ -0,0 +1,305 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// +// -- Define the gemmd operation's object API ---------------------------------- +// + +void bao_gemmd + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c + ) +{ + bao_gemmd_ex + ( + alpha, + a, + d, + b, + beta, + c, + NULL, + NULL + ); +} + +void bao_gemmd_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + bli_init_once(); + + // Initialize a local runtime with global settings if necessary. Note + // that in the case that a runtime is passed in, we make a local copy. + rntm_t rntm_l; + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } + else { rntm_l = *rntm; rntm = &rntm_l; } + + // Obtain a valid (native) context from the gks if necessary. + // NOTE: This must be done before calling the _check() function, since + // that function assumes the context pointer is valid. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bao_gemmd_check( alpha, a, d, b, beta, c, cntx ); + + // -- bli_gemmd_front() ---------------------------------------------------- + + obj_t a_local; + obj_t b_local; + obj_t c_local; + + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + + // Alias A, B, and C in case we need to apply transformations. + bli_obj_alias_to( a, &a_local ); + bli_obj_alias_to( b, &b_local ); + bli_obj_alias_to( c, &c_local ); + + // Induce a transposition of A if it has its transposition property set. + // Then clear the transposition bit in the object. 
+ if ( bli_obj_has_trans( &a_local ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } + + // Induce a transposition of B if it has its transposition property set. + // Then clear the transposition bit in the object. + if ( bli_obj_has_trans( &b_local ) ) + { + bli_obj_induce_trans( &b_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &b_local ); + } + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + { + bli_obj_swap( &a_local, &b_local ); + + bli_obj_induce_trans( &a_local ); + bli_obj_induce_trans( &b_local ); + bli_obj_induce_trans( &c_local ); + } + + // Parse and interpret the contents of the rntm_t object to properly + // set the ways of parallelism for each loop, and then make any + // additional modifications necessary for the current operation. + bli_rntm_set_ways_for_op + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + rntm + ); + + // Spawn threads (if applicable), where bao_gemmd_int() is the thread entry + // point function for each thread. This also begins the process of creating + // the thrinfo_t tree, which contains thread communicators. 
+ bao_l3_thread_decorator + ( + bao_gemmd_int, + BLIS_GEMM, // operation family id + alpha, + &a_local, + d, + &b_local, + beta, + &c_local, + cntx, + rntm + ); +} + +// +// -- Define the gemmd operation's thread entry point -------------------------- +// + +void bao_gemmd_int + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + // In this function, we choose the gemmd implementation that is executed + // on each thread. + +#if 1 + // Call the block-panel algorithm that calls the kernel directly, which + // exposes edge-case handling. + bao_gemmd_bp_var1 + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm, + thread + ); +#else + // Call the block-panel algorithm that calls the kernel indirectly via a + // wrapper function, which hides edge-case handling. + bao_gemmd_bp_var2 + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm, + thread + ); +#endif +} + +// +// -- Define the gemmd operation's typed API ----------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* d, inc_t incd, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + bli_init_once(); \ +\ + /* Determine the datatype (e.g. BLIS_FLOAT, BLIS_DOUBLE, etc.) based on + the macro parameter 'ch' (e.g. s, d, etc). */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, dd, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + /* Adjust the dimensions of matrices A and B according to the transa and + transb parameters. 
*/ \ + bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ + bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ +\ + /* Create bufferless scalar objects and attach the provided scalar pointers + to those scalar objects. */ \ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + /* Create bufferless matrix objects and attach the provided matrix pointers + to those matrix objects. */ \ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, k, 1, d, incd, k, &dd ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + /* Set the transposition/conjugation properties of the objects for matrices + A and B. */ \ + bli_obj_set_conjtrans( transa, &ao ); \ + bli_obj_set_conjtrans( transb, &bo ); \ +\ + /* Call the object interface. */ \ + PASTECH(bao_,opname) \ + ( \ + &alphao, \ + &ao, \ + &dd, \ + &bo, \ + &betao, \ + &co \ + ); \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd ) +GENTFUNC( float, s, gemmd ) +GENTFUNC( double, d, gemmd ) +GENTFUNC( scomplex, c, gemmd ) +GENTFUNC( dcomplex, z, gemmd ) + diff --git a/addon/gemmd/bao_gemmd.h b/addon/gemmd/bao_gemmd.h new file mode 100644 index 000000000..7c7466494 --- /dev/null +++ b/addon/gemmd/bao_gemmd.h @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// +// -- Prototype the gemmd operation's object API ------------------------------- +// + +BLIS_EXPORT_ADDON void bao_gemmd + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c + ); + +BLIS_EXPORT_ADDON void bao_gemmd_ex + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +// +// -- Prototype the gemmd operation's thread entry point ----------------------- +// + +void bao_gemmd_int + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// +// -- Prototype the gemmd operation's typed API -------------------------------- +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \ + ( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* d, inc_t incd, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +//INSERT_GENTPROT_BASIC0( gemmd ) +GENTPROT( float, s, gemmd ) +GENTPROT( double, d, gemmd ) +GENTPROT( scomplex, c, gemmd ) +GENTPROT( dcomplex, z, gemmd ) + diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c new file mode 100644 index 000000000..e042f1fd8 --- /dev/null +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -0,0 +1,530 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmd_fp + +typedef void (*FUNCPTR_T) + ( + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* restrict alpha, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict d, inc_t incd, + void* restrict b, inc_t rs_b, inc_t cs_b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + cntx_t* restrict cntx, + rntm_t* restrict rntm, + thrinfo_t* restrict thread + ); + +// +// -- gemmd-like block-panel algorithm (object interface) ---------------------- +// + +// Define a function pointer array named ftypes and initialize its contents with +// the addresses of the typed functions defined below, bao_?gemmd_bp_var1(). 
+static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1); + +void bao_gemmd_bp_var1 + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + const num_t dt = bli_obj_dt( c ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + + void* restrict buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + + // Index into the function pointer array to extract the correct + // typed function pointer based on the chosen datatype. + FUNCPTR_T f = ftypes[dt]; + + // Invoke the function. 
+ f + ( + conja, + conjb, + m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, + buf_d, incd, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread + ); +} + +// +// -- gemmd-like block-panel algorithm (typed interface) ----------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_d = incd; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict d_00 = d; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of the scalars to prevent any unnecessary sharing of + cache lines between the cores' caches. */ \ + ctype alpha_local = *alpha_cast; \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ + ctype zero_local = *PASTEMAC(ch,0); \ +\ + auxinfo_t aux; \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. 
*/ \ + bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ + BLIS_KC, /* 4th loop */ \ + BLIS_NO_PART, /* pack B */ \ + BLIS_MC, /* 3rd loop */ \ + BLIS_NO_PART, /* pack A */ \ + BLIS_NR, /* 2nd loop */ \ + BLIS_MR, /* 1st loop */ \ + BLIS_KR }; /* microkernel loop */ \ +\ + bszid_t* restrict bszids_jc = &bszids[0]; \ + bszid_t* restrict bszids_pc = &bszids[1]; \ + /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ + bszid_t* restrict bszids_ic = &bszids[3]; \ + /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ + bszid_t* restrict bszids_jr = &bszids[5]; \ + /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* restrict thread_ir = NULL; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict d_pc = d_00 + pp * pcstep_d; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + B. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_b) \ + ( \ + conjb, \ + KC, NC, \ + kc_cur, nc_cur, NR, \ + &one_local, \ + d_pc, incd, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + A. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_a) \ + ( \ + conja, \ + MC, KC, \ + mc_cur, kc_cur, MR, \ + &one_local, \ + d_pc, incd, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Query the number of threads and thread ids for the JR loop. + NOTE: These values are only needed when computing the next + micropanel of B. */ \ + const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur \ + = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* Assume for now that our next panel of B to be the current panel + of B. */ \ + ctype* restrict b2 = b_jr; \ +\ + /* Identify the current thrinfo_t node. */ \ + thread_ir = bli_thrinfo_sub_node( thread_jr ); \ +\ + /* Query the number of threads and thread ids for the IR loop. + NOTE: These values are only needed when computing the next + micropanel of A. */ \ + const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ +\ + /* Compute number of primary and leftover components of the IR loop. */ \ + dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ + dim_t ir_left = mc_cur % MR; \ +\ + /* Compute the IR loop thread range for the current thread. */ \ + dim_t ir_start, ir_end; \ + bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \ +\ + /* Loop over the m dimension (MR rows at a time). 
*/ \ + for ( dim_t i = ir_start; i < ir_end; i += 1 ) \ + { \ + const dim_t mr_cur \ + = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ +\ + ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ + ctype* restrict c_ir = c_jr + i * irstep_c; \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next micropanels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_ic_use; \ + b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_pc_use; \ + } \ +\ + /* Save the addresses of next micropanels of A and B to the + auxinfo_t object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( mr_cur == MR && nr_cur == NR ) \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + &alpha_local, \ + a_ir, \ + b_jr, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + &alpha_local, \ + a_ir, \ + b_jr, \ + &zero_local, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn) \ + ( \ + mr_cur, \ + nr_cur, \ + ct, rs_ct, cs_ct, \ + beta_use, \ + c_ir, rs_c, cs_c \ + ); \ + } \ + } \ + } \ + } \ +\ + /* This barrier is needed to prevent threads from starting to pack + the next row panel of B before the current row panel is fully + computed upon. */ \ + bli_thread_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. 
*/ \ + PASTECH2(bao_,ch,packm_finalize_mem_a) \ + ( \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTECH2(bao_,ch,packm_finalize_mem_b) \ + ( \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var1: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \ +*/ \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd_bp_var1 ) +GENTFUNC( float, s, gemmd_bp_var1 ) +GENTFUNC( double, d, gemmd_bp_var1 ) +GENTFUNC( scomplex, c, gemmd_bp_var1 ) +GENTFUNC( dcomplex, z, gemmd_bp_var1 ) + diff --git a/addon/gemmd/bao_gemmd_bp_var2.c b/addon/gemmd/bao_gemmd_bp_var2.c new file mode 100644 index 000000000..a0040fec0 --- /dev/null +++ b/addon/gemmd/bao_gemmd_bp_var2.c @@ -0,0 +1,602 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmd_fp + +typedef void (*FUNCPTR_T) + ( + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* restrict alpha, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict d, inc_t incd, + void* restrict b, inc_t rs_b, inc_t cs_b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + cntx_t* restrict cntx, + rntm_t* restrict rntm, + thrinfo_t* restrict thread + ); + +// +// -- gemmd-like block-panel algorithm (object interface) ---------------------- +// + +// Define a function pointer array named ftypes and initialize its contents with +// the addresses of the typed functions defined below, bao_?gemmd_bp_var2(). 
+static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var2); + +void bao_gemmd_bp_var2 + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + const num_t dt = bli_obj_dt( c ); + + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + void* restrict buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + + void* restrict buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); + + void* restrict buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); + + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + + // Index into the function pointer array to extract the correct + // typed function pointer based on the chosen datatype. + FUNCPTR_T f = ftypes[dt]; + + // Invoke the function. 
+ f + ( + conja, + conjb, + m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, + buf_d, incd, + buf_b, rs_b, cs_b, + buf_beta, + buf_c, rs_c, cs_c, + cntx, + rntm, + thread + ); +} + +// +// -- gemmd-like block-panel algorithm (typed interface) ----------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for various blocksizes. */ \ + const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + /* + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + */ \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + /* + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ + */ \ +\ + /* Compute partitioning step values for each matrix of each loop. */ \ + const inc_t jcstep_c = cs_c; \ + const inc_t jcstep_b = cs_b; \ +\ + const inc_t pcstep_a = cs_a; \ + const inc_t pcstep_d = incd; \ + const inc_t pcstep_b = rs_b; \ +\ + const inc_t icstep_c = rs_c; \ + const inc_t icstep_a = rs_a; \ +\ + const inc_t jrstep_c = cs_c * NR; \ +\ + const inc_t irstep_c = rs_c * MR; \ +\ + ctype* restrict a_00 = a; \ + ctype* restrict d_00 = d; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + /* Make local copies of the scalars to prevent any unnecessary sharing of + cache lines between the cores' caches. */ \ + ctype alpha_local = *alpha_cast; \ + ctype beta_local = *beta_cast; \ + ctype one_local = *PASTEMAC(ch,1); \ + /*ctype zero_local = *PASTEMAC(ch,0);*/ \ +\ + auxinfo_t aux; \ +\ + /* Initialize a mem_t entry for A and B. Strictly speaking, this is only + needed for the matrix we will be packing (if any), but we do it + unconditionally to be safe. */ \ + mem_t mem_a = BLIS_MEM_INITIALIZER; \ + mem_t mem_b = BLIS_MEM_INITIALIZER; \ +\ + /* Define an array of bszid_t ids, which will act as our substitute for + the cntl_t tree. 
*/ \ + bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ + BLIS_KC, /* 4th loop */ \ + BLIS_NO_PART, /* pack B */ \ + BLIS_MC, /* 3rd loop */ \ + BLIS_NO_PART, /* pack A */ \ + BLIS_NR, /* 2nd loop */ \ + BLIS_MR, /* 1st loop */ \ + BLIS_KR }; /* microkernel loop */ \ +\ + bszid_t* restrict bszids_jc = &bszids[0]; \ + bszid_t* restrict bszids_pc = &bszids[1]; \ + /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ + bszid_t* restrict bszids_ic = &bszids[3]; \ + /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ + bszid_t* restrict bszids_jr = &bszids[5]; \ + /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* restrict thread_ir = NULL; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. */ \ + thread_jc = thread; \ + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ +\ + /* Compute the JC loop thread range for the current thread. */ \ + dim_t jc_start, jc_end; \ + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ + const dim_t n_local = jc_end - jc_start; \ +\ + /* Compute number of primary and leftover components of the JC loop. */ \ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ + const dim_t jc_left = n_local % NC; \ +\ + /* Loop over the n dimension (NC rows/columns at a time). */ \ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ + { \ + /* Calculate the thread's current JC block dimension. */ \ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ +\ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ +\ + /* Compute the PC loop thread range for the current thread. */ \ + const dim_t pc_start = 0, pc_end = k; \ + const dim_t k_local = k; \ +\ + /* Compute number of primary and leftover components of the PC loop. */ \ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ + const dim_t pc_left = k_local % KC; \ +\ + /* Loop over the k dimension (KC rows/columns at a time). */ \ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ + { \ + /* Calculate the thread's current PC block dimension. */ \ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ +\ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict d_pc = d_00 + pp * pcstep_d; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ +\ + /* Only apply beta to the first iteration of the pc loop. */ \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ +\ + ctype* b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + B. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_b) \ + ( \ + conjb, \ + KC, NC, \ + kc_cur, nc_cur, NR, \ + &one_local, \ + d_pc, incd, \ + b_pc, rs_b, cs_b, \ + &b_use, &rs_b_use, &cs_b_use, \ + &ps_b_use, \ + cntx, \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ + /* Alias b_use so that it's clear this is our current block of + matrix B. */ \ + ctype* restrict b_pc_use = b_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ +\ + /* Compute the IC loop thread range for the current thread. */ \ + dim_t ic_start, ic_end; \ + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ + const dim_t m_local = ic_end - ic_start; \ +\ + /* Compute number of primary and leftover components of the IC loop. */ \ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ + const dim_t ic_left = m_local % MC; \ +\ + /* Loop over the m dimension (MC rows at a time). */ \ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ + { \ + /* Calculate the thread's current IC block dimension. */ \ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ +\ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ +\ + ctype* a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ +\ + /* Identify the current thrinfo_t node. Note that the thrinfo_t + node will have already been created by a previous call to + bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART + cause the tree to grow by two (e.g. to the next bszid that is + a normal bszid_t value). */ \ + thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ +\ + /* Determine the packing buffer and related parameters for matrix + A. Then call the packm implementation. */ \ + PASTECH2(bao_,ch,packm_a) \ + ( \ + conja, \ + MC, KC, \ + mc_cur, kc_cur, MR, \ + &one_local, \ + d_pc, incd, \ + a_ic, rs_a, cs_a, \ + &a_use, &rs_a_use, &cs_a_use, \ + &ps_a_use, \ + cntx, \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ +\ + /* Alias a_use so that it's clear this is our current block of + matrix A. */ \ + ctype* restrict a_ic_use = a_use; \ +\ + /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ +\ + /* Query the number of threads and thread ids for the JR loop. + NOTE: These values are only needed when computing the next + micropanel of B. */ \ + const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ +\ + /* Compute number of primary and leftover components of the JR loop. */ \ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ + dim_t jr_left = nc_cur % NR; \ +\ + /* Compute the JR loop thread range for the current thread. */ \ + dim_t jr_start, jr_end; \ + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ + { \ + const dim_t nr_cur \ + = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ +\ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ +\ + /* Assume for now that our next panel of B to be the current panel + of B. */ \ + ctype* restrict b2 = b_jr; \ +\ + /* Identify the current thrinfo_t node. */ \ + thread_ir = bli_thrinfo_sub_node( thread_jr ); \ +\ + /* Query the number of threads and thread ids for the IR loop. + NOTE: These values are only needed when computing the next + micropanel of A. */ \ + const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ +\ + /* Compute number of primary and leftover components of the IR loop. */ \ + dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ + dim_t ir_left = mc_cur % MR; \ +\ + /* Compute the IR loop thread range for the current thread. */ \ + dim_t ir_start, ir_end; \ + bli_thread_range_sub( thread_ir, ir_iter, 1, FALSE, &ir_start, &ir_end ); \ +\ + /* Loop over the m dimension (MR rows at a time). 
*/ \ + for ( dim_t i = ir_start; i < ir_end; i += 1 ) \ + { \ + const dim_t mr_cur \ + = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ +\ + ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ + ctype* restrict c_ir = c_jr + i * irstep_c; \ +\ + ctype* restrict a2; \ +\ + /* Compute the addresses of the next micropanels of A and B. */ \ + a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + { \ + a2 = a_ic_use; \ + b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + b2 = b_pc_use; \ + } \ +\ + /* Save the addresses of next micropanels of A and B to the + auxinfo_t object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Call a wrapper to the kernel (which handles edge cases). */ \ + PASTECH2(bao_,ch,gemm_kernel) \ + ( \ + MR, \ + NR, \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + &alpha_local, \ + a_ir, rs_a_use, cs_a_use, \ + b_jr, rs_b_use, cs_b_use, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* This barrier is needed to prevent threads from starting to pack + the next row panel of B before the current row panel is fully + computed upon. */ \ + bli_thread_barrier( thread_pb ); \ + } \ + } \ +\ + /* Release any memory that was acquired for packing matrices A and B. 
*/ \ + PASTECH2(bao_,ch,packm_finalize_mem_a) \ + ( \ + rntm, \ + &mem_a, \ + thread_pa \ + ); \ + PASTECH2(bao_,ch,packm_finalize_mem_b) \ + ( \ + rntm, \ + &mem_b, \ + thread_pb \ + ); \ +\ +/* +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: b1_packed", kc_cur, nr_cur, b_jr, rs_b_use, cs_b_use, "%5.2f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemmd_bp_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%5.2f", "" ); \ +*/ \ +} + +//INSERT_GENTFUNC_BASIC0( gemmd_bp_var2 ) +GENTFUNC( float, s, gemmd_bp_var2 ) +GENTFUNC( double, d, gemmd_bp_var2 ) +GENTFUNC( scomplex, c, gemmd_bp_var2 ) +GENTFUNC( dcomplex, z, gemmd_bp_var2 ) + +// +// -- gemm-like microkernel wrapper -------------------------------------------- +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t kc_cur, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ + ) \ +{ \ + /* Infer the datatype from the ctype. */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Query the context for the microkernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype zero = *PASTEMAC(ch,0); \ +\ + /* Handle interior and edge cases separately. */ \ + if ( mr_cur == MR && nr_cur == NR ) \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + aux, \ + cntx \ + ); \ + } \ + else \ + { \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + kc_cur, \ + alpha, \ + a, \ + b, \ + &zero, \ + ct, rs_ct, cs_ct, \ + aux, \ + cntx \ + ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn) \ + ( \ + mr_cur, \ + nr_cur, \ + ct, rs_ct, cs_ct, \ + beta, \ + c, rs_c, cs_c \ + ); \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( gemm_kernel ) +GENTFUNC( float, s, gemm_kernel ) +GENTFUNC( double, d, gemm_kernel ) +GENTFUNC( scomplex, c, gemm_kernel ) +GENTFUNC( dcomplex, z, gemm_kernel ) + diff --git a/addon/gemmd/bao_gemmd_check.c b/addon/gemmd/bao_gemmd_check.c new file mode 100644 index 000000000..864e9a1ac --- /dev/null +++ b/addon/gemmd/bao_gemmd_check.c @@ -0,0 +1,131 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bao_gemmd_check + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_noninteger_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_noninteger_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( c ); + bli_check_error_code( e_val ); + + // Check scalar/vector/matrix type. 
+ + e_val = bli_check_scalar_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_object( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_matrix_object( c ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). + + e_val = bli_check_object_buffer( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( d ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( beta ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( c ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_level3_dims( a, b, c ); + bli_check_error_code( e_val ); + + e_val = bli_check_vector_dim_equals( d, bli_obj_width_after_trans( a ) ); + bli_check_error_code( e_val ); + + // Check for consistent datatypes. + // NOTE: We only perform these tests when mixed datatype support is + // disabled. + + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, d ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); +} + diff --git a/addon/gemmd/bao_gemmd_check.h b/addon/gemmd/bao_gemmd_check.h new file mode 100644 index 000000000..243ec70c8 --- /dev/null +++ b/addon/gemmd/bao_gemmd_check.h @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based check functions. 
+// + +void bao_gemmd_check + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx + ); + diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/gemmd/bao_gemmd_var.h new file mode 100644 index 000000000..5c6674727 --- /dev/null +++ b/addon/gemmd/bao_gemmd_var.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +// +// Prototype the object-based variant interfaces (gemmd_bp_var1 and gemmd_bp_var2). +// + +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTECH(bao_,opname) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* d, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ); + +GENPROT( gemmd_bp_var1 ) +GENPROT( gemmd_bp_var2 ) + + +// +// Prototype the typed variant interfaces, one per datatype (s, d, c, z). +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, /* d takes a single increment, unlike a, b and c */ \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( gemmd_bp_var1 ) +GENTPROT( float, s, gemmd_bp_var1 ) +GENTPROT( double, d, gemmd_bp_var1 ) +GENTPROT( scomplex, c, gemmd_bp_var1 ) +GENTPROT( dcomplex, z, gemmd_bp_var1 ) + +//INSERT_GENTPROT_BASIC0( gemmd_bp_var2 ) +GENTPROT( float, s, gemmd_bp_var2 ) +GENTPROT( double, d, gemmd_bp_var2 ) +GENTPROT( scomplex, c, gemmd_bp_var2 ) +GENTPROT( dcomplex, z, gemmd_bp_var2 ) + + +// +// Prototype the typed kernel interfaces.
+// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, /* NOTE(review): presumably the rows actually computed for this tile (<= MR) -- confirm */ \ + dim_t nr_cur, /* NOTE(review): presumably the columns actually computed for this tile (<= NR) -- confirm */ \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ + ); + +//INSERT_GENTPROT_BASIC0( gemm_kernel ) +GENTPROT( float, s, gemm_kernel ) +GENTPROT( double, d, gemm_kernel ) +GENTPROT( scomplex, c, gemm_kernel ) +GENTPROT( dcomplex, z, gemm_kernel ) + diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c new file mode 100644 index 000000000..49bb34664 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_a.c @@ -0,0 +1,330 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + cntx_t* restrict cntx, /* not referenced in this function's body */ \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + /* Set the pack buffer type so that we are obtaining memory blocks from + the pool dedicated to blocks of A. */ \ + const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \ +\ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ + const dim_t k_pack = k; \ +\ + /* Barrier to make sure all threads are caught up and ready to begin the + packm stage. */ \ + bli_thread_barrier( thread ); \ +\ + /* Compute the size of the memory block needed. */ \ + siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ +\ + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the packed block allocator. */ \ + if ( bli_mem_is_unalloc( mem ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Acquire directly to the chief thread's mem_t that was passed in.
+ It needs to be that mem_t struct, and not a local (temporary) + mem_t, since there is no barrier until after packing is finished, + which could allow a race condition whereby the chief thread exits + the current function before the other threads have a chance to + copy from it. (A barrier would fix that race condition, but then + again, I prefer to keep barriers to a minimum.) */ \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t to all + threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else /* if ( bli_mem_is_alloc( mem ) ) */ \ + { \ + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the packed + block allocator and cached by the caller. */ \ +\ + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. */ \ + siz_t mem_size = bli_mem_size( mem ); \ +\ + if ( mem_size < size_needed ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See comment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t.)
*/ \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else \ + { \ + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed. */ \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_mem_a ) +GENTFUNC( float, s, packm_init_mem_a ) +GENTFUNC( double, d, packm_init_mem_a ) +GENTFUNC( scomplex, c, packm_init_mem_a ) +GENTFUNC( dcomplex, z, packm_init_mem_a ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + if ( thread != NULL ) /* a NULL thread makes this function a no-op */ \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be.
*/ \ + if ( bli_mem_is_alloc( mem ) ) \ + { \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a ) +GENTFUNC( float, s, packm_finalize_mem_a ) +GENTFUNC( double, d, packm_finalize_mem_a ) +GENTFUNC( scomplex, c, packm_finalize_mem_a ) +GENTFUNC( dcomplex, z, packm_finalize_mem_a ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* restrict m_max, \ + dim_t* restrict k_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ) \ +{ \ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (cs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ + *k_max = k; \ +\ + /* Determine the dimensions and strides for the packed matrix A. */ \ + { \ + /* Pack A to column-stored row-panels. */ \ + *rs_p = 1; \ + *cs_p = mr; \ +\ + *pd_p = mr; \ + *ps_p = mr * k; /* panel stride: one mr x k micropanel */ \ +\ + /* Set the schema to "packed row panels" to indicate packing to + conventional column-stored row panels. */ \ + *schema = BLIS_PACKED_ROW_PANELS; \ + } \ +\ + /* Set the buffer address provided by the caller to point to the memory + associated with the mem_t entry acquired from the memory pool. */ \ + *p = bli_mem_buffer( mem ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_a ) +GENTFUNC( float, s, packm_init_a ) +GENTFUNC( double, d, packm_init_a ) +GENTFUNC( scomplex, c, packm_init_a ) +GENTFUNC( dcomplex, z, packm_init_a ) + + +// +// Define BLAS-like interfaces to the variant chooser.
+// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t m_alloc, \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + pack_t schema; \ + dim_t m_max; \ + dim_t k_max; \ + dim_t pd_p; \ +\ + /* Prepare the packing destination buffer. Note that the buffer is sized from m_alloc and k_alloc, not from m and k. */ \ + PASTECH2(bao_,ch,packm_init_mem_a) \ + ( \ + m_alloc, k_alloc, mr, \ + cntx, \ + rntm, \ + mem, \ + thread \ + ); \ +\ + /* Determine the packing buffer and related parameters for matrix A. */ \ + PASTECH2(bao_,ch,packm_init_a) \ + ( \ + &schema, \ + m, k, mr, \ + &m_max, &k_max, \ + p, rs_p, cs_p, \ + &pd_p, ps_p, \ + mem \ + ); \ +\ + /* Pack matrix A to the destination buffer chosen above. Here, the packed + matrix is stored to column-stored MR x k micropanels. */ \ + PASTECH2(bao_,ch,packm_var1) \ + ( \ + conj, /* packm_var1 declares this parameter as trans_t transc and extracts its conj bit */ \ + schema, \ + m, \ + k, \ + m_max, \ + k_max, \ + kappa, \ + d, incd, \ + a, rs_a, cs_a, \ + *p, *rs_p, *cs_p, \ + pd_p, *ps_p, \ + cntx, \ + thread \ + ); \ +\ + /* Barrier so that packing is done before computation. */ \ + bli_thread_barrier( thread ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_a ) +GENTFUNC( float, s, packm_a ) +GENTFUNC( double, d, packm_a ) +GENTFUNC( scomplex, c, packm_a ) +GENTFUNC( dcomplex, z, packm_a ) + diff --git a/addon/gemmd/bao_l3_packm_a.h b/addon/gemmd/bao_l3_packm_a.h new file mode 100644 index 000000000..b683b79d4 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_a.h @@ -0,0 +1,123 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t m, \ + dim_t k, \ + dim_t mr, /* m is rounded up to a multiple of mr when sizing the block */ \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_mem_a ) +GENTPROT( float, s, packm_init_mem_a ) +GENTPROT( double, d, packm_init_mem_a ) +GENTPROT( scomplex, c, packm_init_mem_a ) +GENTPROT( dcomplex, z, packm_init_mem_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread /* NULL is accepted (no-op); only the chief thread releases mem */ \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a ) +GENTPROT( float, s, packm_finalize_mem_a ) +GENTPROT( double, d, packm_finalize_mem_a ) +GENTPROT( scomplex, c, packm_finalize_mem_a ) +GENTPROT( dcomplex, z, packm_finalize_mem_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, /* out: set to BLIS_PACKED_ROW_PANELS */ \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* restrict m_max, /* out: m rounded up to a multiple of mr */ \ + dim_t* restrict k_max, /* out: k */ \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, /* out: buffer of mem; rs_p = 1, cs_p = mr */ \ + dim_t* restrict pd_p, inc_t* restrict ps_p, /* out: pd_p = mr, ps_p = mr * k */ \ + mem_t* restrict mem \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_a ) +GENTPROT( float, s, packm_init_a ) +GENTPROT( double, d, packm_init_a ) +GENTPROT( scomplex, c, packm_init_a ) +GENTPROT( dcomplex, z, packm_init_a ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t m_alloc, /* m_alloc and k_alloc size the pack buffer; m and k are what gets packed */ \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ +
+//INSERT_GENTPROT_BASIC0( packm_a ) +GENTPROT( float, s, packm_a ) +GENTPROT( double, d, packm_a ) +GENTPROT( scomplex, c, packm_a ) +GENTPROT( dcomplex, z, packm_a ) + diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c new file mode 100644 index 000000000..c41b062b6 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_b.c @@ -0,0 +1,330 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + cntx_t* restrict cntx, /* not referenced in this function's body */ \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + /* Set the pack buffer type so that we are obtaining memory blocks from + the pool dedicated to panels of B. */ \ + const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; \ +\ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (rs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + const dim_t k_pack = k; \ + const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ +\ + /* Barrier to make sure all threads are caught up and ready to begin the + packm stage. */ \ + bli_thread_barrier( thread ); \ +\ + /* Compute the size of the memory block needed. */ \ + siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ +\ + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the packed block allocator. */ \ + if ( bli_mem_is_unalloc( mem ) ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Acquire directly to the chief thread's mem_t that was passed in. + It needs to be that mem_t struct, and not a local (temporary) + mem_t, since there is no barrier until after packing is finished, + which could allow a race condition whereby the chief thread exits + the current function before the other threads have a chance to + copy from it. (A barrier would fix that race condition, but then + again, I prefer to keep barriers to a minimum.) */ \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t to all + threads.
*/ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else /* if ( bli_mem_is_alloc( mem ) ) */ \ + { \ + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the packed + block allocator and cached by the caller. */ \ +\ + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. */ \ + siz_t mem_size = bli_mem_size( mem ); \ +\ + if ( mem_size < size_needed ) \ + { \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See comment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t.) */ \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + bli_pba_acquire_m \ + ( \ + rntm, \ + size_needed, \ + pack_buf_type, \ + mem \ + ); \ + } \ +\ + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ +\ + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ \ + if ( !bli_thread_am_ochief( thread ) ) \ + { \ + *mem = *mem_p; \ + } \ + } \ + else \ + { \ + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed.
*/ \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_mem_b ) +GENTFUNC( float, s, packm_init_mem_b ) +GENTFUNC( double, d, packm_init_mem_b ) +GENTFUNC( scomplex, c, packm_init_mem_b ) +GENTFUNC( dcomplex, z, packm_init_mem_b ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + if ( thread != NULL ) /* a NULL thread makes this function a no-op */ \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be. */ \ + if ( bli_mem_is_alloc( mem ) ) \ + { \ + bli_pba_release \ + ( \ + rntm, \ + mem \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b ) +GENTFUNC( float, s, packm_finalize_mem_b ) +GENTFUNC( double, d, packm_finalize_mem_b ) +GENTFUNC( scomplex, c, packm_finalize_mem_b ) +GENTFUNC( dcomplex, z, packm_finalize_mem_b ) + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* restrict k_max, \ + dim_t* restrict n_max, \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + dim_t* restrict pd_p, inc_t* restrict ps_p, \ + mem_t* restrict mem \ + ) \ +{ \ + /* NOTE: This "rounding up" of the last upanel is absolutely necessary since + we NEED that last micropanel to have the same ldim (rs_p) as the other + micropanels. Why? Because the microkernel assumes that the register (MR, + NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \ + *k_max = k; \ + *n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ +\ + /* Determine the dimensions and strides for the packed matrix B. */ \ + { \ + /* Pack B to row-stored column-panels. */ \ + *rs_p = nr; \ + *cs_p = 1; \ +\ + *pd_p = nr; \ + *ps_p = k * nr; /* panel stride: one k x nr micropanel */ \ +\ + /* Set the schema to "packed column panels" to indicate packing to + conventional row-stored column panels.
*/ \ + *schema = BLIS_PACKED_COL_PANELS; \ + } \ +\ + /* Set the buffer address provided by the caller to point to the memory + associated with the mem_t entry acquired from the memory pool. */ \ + *p = bli_mem_buffer( mem ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_init_b ) +GENTFUNC( float, s, packm_init_b ) +GENTFUNC( double, d, packm_init_b ) +GENTFUNC( scomplex, c, packm_init_b ) +GENTFUNC( dcomplex, z, packm_init_b ) + + +// +// Define BLAS-like interfaces to the variant chooser. +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t k_alloc, \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + pack_t schema; \ + dim_t k_max; \ + dim_t n_max; \ + dim_t pd_p; \ +\ + /* Prepare the packing destination buffer. Note that the buffer is sized from k_alloc and n_alloc, not from k and n. */ \ + PASTECH2(bao_,ch,packm_init_mem_b) \ + ( \ + k_alloc, n_alloc, nr, \ + cntx, \ + rntm, \ + mem, \ + thread \ + ); \ +\ + /* Determine the packing buffer and related parameters for matrix B. */ \ + PASTECH2(bao_,ch,packm_init_b) \ + ( \ + &schema, \ + k, n, nr, \ + &k_max, &n_max, \ + p, rs_p, cs_p, \ + &pd_p, ps_p, \ + mem \ + ); \ +\ + /* Pack matrix B to the destination buffer chosen above. Here, the packed + matrix is stored to row-stored k x NR micropanels. */ \ + PASTECH2(bao_,ch,packm_var1) \ + ( \ + conj, /* packm_var1 declares this parameter as trans_t transc and extracts its conj bit */ \ + schema, \ + k, \ + n, \ + k_max, \ + n_max, \ + kappa, \ + d, incd, \ + b, rs_b, cs_b, \ + *p, *rs_p, *cs_p, \ + pd_p, *ps_p, \ + cntx, \ + thread \ + ); \ +\ + /* Barrier so that packing is done before computation.
*/ \ + bli_thread_barrier( thread ); \ +} + +//INSERT_GENTFUNC_BASIC0( packm_b ) +GENTFUNC( float, s, packm_b ) +GENTFUNC( double, d, packm_b ) +GENTFUNC( scomplex, c, packm_b ) +GENTFUNC( dcomplex, z, packm_b ) + diff --git a/addon/gemmd/bao_l3_packm_b.h b/addon/gemmd/bao_l3_packm_b.h new file mode 100644 index 000000000..9161604ce --- /dev/null +++ b/addon/gemmd/bao_l3_packm_b.h @@ -0,0 +1,123 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + dim_t k, \ + dim_t n, \ + dim_t nr, /* n is rounded up to a multiple of nr when sizing the block */ \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_mem_b ) +GENTPROT( float, s, packm_init_mem_b ) +GENTPROT( double, d, packm_init_mem_b ) +GENTPROT( scomplex, c, packm_init_mem_b ) +GENTPROT( dcomplex, z, packm_init_mem_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread /* NULL is accepted (no-op); only the chief thread releases mem */ \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b ) +GENTPROT( float, s, packm_finalize_mem_b ) +GENTPROT( double, d, packm_finalize_mem_b ) +GENTPROT( scomplex, c, packm_finalize_mem_b ) +GENTPROT( dcomplex, z, packm_finalize_mem_b ) + + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + pack_t* restrict schema, /* out: set to BLIS_PACKED_COL_PANELS */ \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* restrict k_max, /* out: k */ \ + dim_t* restrict n_max, /* out: n rounded up to a multiple of nr */ \ + ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, /* out: buffer of mem; rs_p = nr, cs_p = 1 */ \ + dim_t* restrict pd_p, inc_t* restrict ps_p, /* out: pd_p = nr, ps_p = k * nr */ \ + mem_t* restrict mem \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_init_b ) +GENTPROT( float, s, packm_init_b ) +GENTPROT( double, d, packm_init_b ) +GENTPROT( scomplex, c, packm_init_b ) +GENTPROT( dcomplex, z, packm_init_b ) +
+ +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conj, \ + dim_t k_alloc, /* k_alloc and n_alloc size the pack buffer; k and n are what gets packed */ \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ + inc_t* restrict ps_p, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + mem_t* restrict mem, \ + thrinfo_t* restrict thread \ + ); \ + +//INSERT_GENTPROT_BASIC0( packm_b ) +GENTPROT( float, s, packm_b ) +GENTPROT( double, d, packm_b ) +GENTPROT( scomplex, c, packm_b ) +GENTPROT( dcomplex, z, packm_b ) + diff --git a/addon/gemmd/bao_l3_packm_var.h b/addon/gemmd/bao_l3_packm_var.h new file mode 100644 index 000000000..063e59e5f --- /dev/null +++ b/addon/gemmd/bao_l3_packm_var.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// Prototype BLAS-like interfaces to the variants. +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + trans_t transc, /* packm_var1 extracts the conj bit from this; packm_a and packm_b pass a conj_t here */ \ + pack_t schema, /* BLIS_PACKED_ROW_PANELS from packm_init_a, BLIS_PACKED_COL_PANELS from packm_init_b */ \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ); + +//INSERT_GENTPROT_BASIC0( packm_var1 ) +GENTPROT( float, s, packm_var1 ) +GENTPROT( double, d, packm_var1 ) +GENTPROT( scomplex, c, packm_var1 ) +GENTPROT( dcomplex, z, packm_var1 ) + +//INSERT_GENTPROT_BASIC0( packm_var2 ) +GENTPROT( float, s, packm_var2 ) +GENTPROT( double, d, packm_var2 ) +GENTPROT( scomplex, c, packm_var2 ) +GENTPROT( dcomplex, z, packm_var2 ) diff --git a/addon/gemmd/bao_l3_packm_var1.c b/addon/gemmd/bao_l3_packm_var1.c new file mode 100644 index 000000000..24c0a2cc1 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_var1.c @@ -0,0 +1,195 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Variant 1 provides basic support for packing by calling packm_cxk(). 
+// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + dim_t panel_dim; \ + dim_t panel_dim_max; \ + inc_t incc; \ + inc_t ldc; \ + inc_t ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool row_stored = bli_is_col_packed( schema ); \ + /*bool col_stored = bli_is_row_packed( schema );*/ \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need.
*/ \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P. + Only forward iteration is implemented: ic0 = 0, ic_inc = panel_dim_max. */ \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + } \ +\ + ctype* restrict p_begin = p_cast; \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, it = 0; it < n_iter; \ + ic += ic_inc, it += 1 ) \ + { \ + panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + ctype* restrict c_begin = c_cast + (ic )*incc; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ +\ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. (The + default is slab.)
*/ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + { \ + PASTECH2(bao_,ch,packm_cxk) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_dim_max, \ + panel_len, \ + panel_len_max, \ + kappa_cast, \ + d, incd, \ + c_use, incc, ldc, \ + p_use, ldp, \ + cntx \ + ); \ + } \ +\ +/* +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ +\ + /* Advance to the next packed micropanel on every iteration, whether or + not this thread packed the current one. */ \ + p_begin += ps_p; \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var1 ) +GENTFUNC( float, s, packm_var1 ) +GENTFUNC( double, d, packm_var1 ) +GENTFUNC( scomplex, c, packm_var1 ) +GENTFUNC( dcomplex, z, packm_var1 ) + diff --git a/addon/gemmd/bao_l3_packm_var2.c b/addon/gemmd/bao_l3_packm_var2.c new file mode 100644 index 000000000..830e499b3 --- /dev/null +++ b/addon/gemmd/bao_l3_packm_var2.c @@ -0,0 +1,245 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Variant 2 is similar to variant 1, but inlines the contents of packm_cxk(). +// NOTE(review): the inlined loops below only copy (copys/copyjs). Unlike +// packm_cxk() they never read the d/incd arguments, and the inner loop index +// named d shadows the parameter d. Confirm this variant is only used when no +// diagonal scaling is required. +// + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict d, inc_t incd, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* restrict cntx, \ + thrinfo_t* restrict thread \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + dim_t panel_dim; \ + dim_t panel_dim_max; \ + inc_t incc; \ + inc_t ldc; \ + inc_t ldp; \ + conj_t conjc; \ +\ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* Create flags to indicate row or column storage.
Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + bool row_stored = bli_is_col_packed( schema ); \ + /*bool col_stored = bli_is_row_packed( schema );*/ \ +\ + /* If the row storage flag indicates row storage, then we are packing + to column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( row_stored ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( col_stored ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P. + Only forward iteration is implemented: ic0 = 0, ic_inc = panel_dim_max. */ \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + } \ +\ + ctype* restrict p_begin = p_cast; \ +\ + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ \ + const dim_t nt = bli_thread_n_way( thread ); \ + const dim_t tid = bli_thread_work_id( thread ); \ +\ + /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ + ( void )nt; \ + ( void )tid; \ +\ + dim_t it_start, it_end, it_inc; \ +\ + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time.
*/ \ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ +\ + /* Iterate over every logical micropanel in the source matrix. */ \ + for ( ic = ic0, it = 0; it < n_iter; \ + ic += ic_inc, it += 1 ) \ + { \ + panel_dim = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + ctype* restrict c_begin = c_cast + (ic )*incc; \ +\ + ctype* restrict c_use = c_begin; \ + ctype* restrict p_use = p_begin; \ +\ + /* The definition of bli_packm_my_iter() will depend on whether slab + or round-robin partitioning was requested at configure-time. (The + default is slab.) */ \ + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + { \ + /* NOTE: We assume here that kappa = 1 and therefore ignore it. If + we're wrong, this will get someone's attention. */ \ + if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + bli_abort(); \ +\ + /* Perform the packing, taking conjc into account. NOTE: the loop index d below shadows the parameter d, so the diagonal is never applied here. */ \ + if ( bli_is_conj( conjc ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copyjs)( *cld, *pld ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t d = 0; d < panel_dim; ++d ) \ + { \ + ctype* cld = c_use + (l )*ldc + (d )*incc; \ + ctype* pld = p_use + (l )*ldp + (d )*1; \ +\ + PASTEMAC(ch,copys)( *cld, *pld ); \ + } \ + } \ + } \ +\ + /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \ + if ( panel_dim < panel_dim_max ) \ + { \ + const dim_t i = panel_dim; \ + const dim_t m_edge = panel_dim_max - panel_dim; \ + const dim_t n_edge = panel_len_max; \ + ctype* restrict p_edge = p_use + (i )*1; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ +\ + /* If panel_len < panel_len_max, then we zero those unused columns.
*/ \ + if ( panel_len < panel_len_max ) \ + { \ + const dim_t j = panel_len; \ + const dim_t m_edge = panel_dim_max; \ + const dim_t n_edge = panel_len_max - panel_len; \ + ctype* restrict p_edge = p_use + (j )*ldp; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ + } \ +\ +/* +if ( !row_stored ) \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: a packed", panel_dim_max, panel_len_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +else \ +PASTEMAC(ch,fprintm)( stdout, "packm_var2: b packed", panel_len_max, panel_dim_max, \ + p_use, rs_p, cs_p, "%5.2f", "" ); \ +*/ \ +\ + p_begin += ps_p; \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_var2 ) +GENTFUNC( float, s, packm_var2 ) +GENTFUNC( double, d, packm_var2 ) +GENTFUNC( scomplex, c, packm_var2 ) +GENTFUNC( dcomplex, z, packm_var2 ) + diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c new file mode 100644 index 000000000..645f09d79 --- /dev/null +++ b/addon/gemmd/bao_packm_cxk.c @@ -0,0 +1,199 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname ) \ +\ +void PASTECH2(bao_,ch,opname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_dim_max, \ + dim_t panel_len, \ + dim_t panel_len_max, \ + ctype* kappa, \ + ctype* d, inc_t incd, \ + ctype* a, inc_t inca, inc_t lda, \ + ctype* p, inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + /* Note that we use panel_dim_max, not panel_dim, to query the packm + kernel function pointer. This means that we always use the same + kernel, even for edge cases. */ \ + num_t dt = PASTEMAC(ch,type); \ + l1mkr_t ker_id = panel_dim_max; \ +\ + PASTECH2(ch,opname,_ker_ft) f; \ +\ + /* Query the context for the packm kernel corresponding to the current + panel dimension, or kernel id. If the id is invalid, the function will + return NULL. */ \ + f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we pack with the + inline copy/scal2 loops in the else branch below. */ \ + /* NOTE: We've disabled calling packm micro-kernels from the context for + this implementation. To re-enable, change FALSE to TRUE in the + conditional below. Note that the kernel call passes neither d nor incd.
*/ \ + if ( f != NULL && FALSE ) \ + { \ + f \ + ( \ + conja, \ + schema, \ + panel_dim, \ + panel_len, \ + panel_len_max, \ + kappa, \ + a, inca, lda, \ + p, ldp, \ + cntx \ + ); \ + } \ + else \ + { \ + /* NOTE: We assume here that kappa = 1 and therefore ignore it. If + we're wrong, this will get someone's attention. */ \ + if ( !PASTEMAC(ch,eq1)( *kappa ) ) \ + bli_abort(); \ +\ + if ( d == NULL ) \ + { \ + /* Perform the packing, taking conja into account. */ \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + PASTEMAC(ch,copyjs)( *ali, *pli ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + PASTEMAC(ch,copys)( *ali, *pli ); \ + } \ + } \ + } \ + } \ + else /* if ( d != NULL ) */ \ + { \ + /* Perform the packing, taking conja into account. */ \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* dl = d + (l )*incd; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + /* Note that ali must be the second operand here since + that is what is conjugated by scal2js. */ \ + PASTEMAC(ch,scal2js)( *dl, *ali, *pli ); \ + } \ + } \ + } \ + else \ + { \ + for ( dim_t l = 0; l < panel_len; ++l ) \ + { \ + for ( dim_t i = 0; i < panel_dim; ++i ) \ + { \ + ctype* ali = a + (l )*lda + (i )*inca; \ + ctype* dl = d + (l )*incd; \ + ctype* pli = p + (l )*ldp + (i )*1; \ +\ + PASTEMAC(ch,scal2s)( *ali, *dl, *pli ); \ + } \ + } \ + } \ + } \ +\ + /* If panel_dim < panel_dim_max, then we zero those unused rows.
*/ \ + if ( panel_dim < panel_dim_max ) \ + { \ + const dim_t i = panel_dim; \ + const dim_t m_edge = panel_dim_max - panel_dim; \ + const dim_t n_edge = panel_len_max; \ + ctype* restrict p_edge = p + (i )*1; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ +\ + /* If panel_len < panel_len_max, then we zero those unused columns. */ \ + if ( panel_len < panel_len_max ) \ + { \ + const dim_t j = panel_len; \ + const dim_t m_edge = panel_dim_max; \ + const dim_t n_edge = panel_len_max - panel_len; \ + ctype* restrict p_edge = p + (j )*ldp; \ +\ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ + } \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( packm_cxk ) +GENTFUNC( float, s, packm_cxk ) +GENTFUNC( double, d, packm_cxk ) +GENTFUNC( scomplex, c, packm_cxk ) +GENTFUNC( dcomplex, z, packm_cxk ) + diff --git a/addon/gemmd/bao_packm_cxk.h b/addon/gemmd/bao_packm_cxk.h new file mode 100644 index 000000000..3e977a7cc --- /dev/null +++ b/addon/gemmd/bao_packm_cxk.h @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +/* Prototype template for packm_cxk; the parameter list must match the definition in bao_packm_cxk.c. Instantiated below for s, d, c and z. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTECH2(bao_,ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_dim_max, \ + dim_t panel_len, \ + dim_t panel_len_max, \ + ctype* kappa, \ + ctype* d, inc_t incd, \ + ctype* a, inc_t inca, inc_t lda, \ + ctype* p, inc_t ldp, \ + cntx_t* cntx \ + ); + +//INSERT_GENTPROT_BASIC0( packm_cxk ) +GENTPROT( float, s, packm_cxk ) +GENTPROT( double, d, packm_cxk ) +GENTPROT( scomplex, c, packm_cxk ) +GENTPROT( dcomplex, z, packm_cxk ) + diff --git a/addon/gemmd/gemmd.h b/addon/gemmd/gemmd.h new file mode 100644 index 000000000..cab61bd18 --- /dev/null +++ b/addon/gemmd/gemmd.h @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of copyright holder(s) nor the names + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef GEMMD_H +#define GEMMD_H + +// This header should contain (or #include) any definitions that must be +// folded into blis.h. + +#include "bao_gemmd.h" +#include "bao_gemmd_check.h" +#include "bao_gemmd_var.h" + +#include "bao_l3_packm_a.h" +#include "bao_l3_packm_b.h" +#include "bao_l3_packm_var.h" + +#include "bao_packm_cxk.h" + +#include "bao_l3_decor.h" + + +#endif diff --git a/addon/gemmd/thread/bao_l3_decor.h b/addon/gemmd/thread/bao_l3_decor.h new file mode 100644 index 000000000..b4fd2b9b7 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor.h @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_H +#define BLIS_SBX_L3_DECOR_H + +// -- sup definitions ---------------------------------------------------------- + +// Level-3 internal function type: the per-thread worker signature that +// bao_l3_thread_decorator() accepts as its func argument. It takes the +// operands alpha, a, d, b, beta, c plus a cntx_t, an rntm_t and the calling +// thread's thrinfo_t. (The "sup" wording in this header appears to be +// inherited from the code it was derived from -- confirm.) +typedef void (*l3sbxint_t) + ( + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +// Level-3 sup thread decorator prototype.
+void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); +/* In the OpenMP implementation, func is called once per thread with the + operands above, a thread-local copy of rntm, and that thread's thrinfo_t. */ + +// Include definitions specific to the method of multithreading. +#include "bao_l3_decor_single.h" +#include "bao_l3_decor_openmp.h" +#include "bao_l3_decor_pthreads.h" + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/gemmd/thread/bao_l3_decor_openmp.c new file mode 100644 index 000000000..1aca8de27 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_openmp.c @@ -0,0 +1,140 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_OPENMP + +// Define a dummy thread entry function, which is needed in the pthreads +// version, so that when building Windows DLLs (with OpenMP enabled or with +// no multithreading) we don't risk having an unresolved symbol. +void* bao_l3_thread_entry( void* data_void ) { return NULL; } + +//#define PRINT_THRINFO + +void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // Query the total number of threads from the rntm_t object. + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below.
+ bli_pba_rntm_set_pba( rntm ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + + + _Pragma( "omp parallel num_threads(n_threads)" ) + { + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Query the thread's id from OpenMP. + const dim_t tid = omp_get_thread_num(); + + // Check for a somewhat obscure OpenMP thread-mismatch issue. + // NOTE: This calls the same function used for the conventional/large + // code path. + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + } + + // We shouldn't free the global communicator here: it is expected to have + // been freed already by the communicator's chief thread during thrinfo_t + // teardown. In this file that teardown is bli_l3_sup_thrinfo_free(), + // called inside the parallel region above. NOTE(review): the original + // comment named bli_l3_thrinfo_free() -- confirm where gl_comm is freed. + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion.
+ bli_sba_checkin_array( array ); +} + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.h b/addon/gemmd/thread/bao_l3_decor_openmp.h new file mode 100644 index 000000000..9c956d7c3 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_openmp.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#ifndef BLIS_SBX_L3_DECOR_OPENMP_H +#define BLIS_SBX_L3_DECOR_OPENMP_H + +// Definitions specific to situations when OpenMP multithreading is enabled. +#ifdef BLIS_ENABLE_OPENMP + +#endif + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.c b/addon/gemmd/thread/bao_l3_decor_pthreads.c new file mode 100644 index 000000000..587b8400f --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_pthreads.c @@ -0,0 +1,220 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS + +// A data structure to assist in passing operands to additional threads. +typedef struct thread_data +{ + l3sbxint_t func; + opid_t family; + obj_t* alpha; + obj_t* a; + obj_t* d; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; +} thread_data_t; + +// Entry point function for each thread: spawned threads run it via bli_pthread_create(), while thread 0 calls it directly from the decorator. +void* bao_l3_thread_entry( void* data_void ) +{ + thread_data_t* data = data_void; + + l3sbxint_t func = data->func; + opid_t family = data->family; + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* d = data->d; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; + + ( void )family; + + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. 
+ bli_sba_rntm_set_pool( tid, array, rntm_p ); + + thrinfo_t* thread = NULL; + + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + + func + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); + + return NULL; +} + +void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + err_t r_val; + + // Query the total number of threads from the rntm_t. + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. + bli_pba_rntm_set_pba( rntm ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions. 
+ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) + { + // Set up the thread data for this thread id (every thread, including thread 0, gets an entry). + datas[tid].func = func; + datas[tid].family = family; + datas[tid].alpha = alpha; + datas[tid].a = a; + datas[tid].d = d; + datas[tid].b = b; + datas[tid].beta = beta; + datas[tid].c = c; + datas[tid].cntx = cntx; + datas[tid].rntm = rntm; + datas[tid].tid = tid; + datas[tid].gl_comm = gl_comm; + datas[tid].array = array; + + // Spawn additional threads for ids greater than 0; thread 0 (the calling thread) runs the entry function directly. + if ( tid != 0 ) + bli_pthread_create( &pthreads[tid], NULL, &bao_l3_thread_entry, &datas[tid] ); + else + bao_l3_thread_entry( ( void* )(&datas[0]) ); + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). + + // Thread 0 waits for additional threads to finish. + for ( dim_t tid = 1; tid < n_threads; tid++ ) + { + bli_pthread_join( pthreads[tid], NULL ); + } + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. 
+ bli_sba_checkin_array( array ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( pthreads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( datas ); +} + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.h b/addon/gemmd/thread/bao_l3_decor_pthreads.h new file mode 100644 index 000000000..69adec45e --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_pthreads.h @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H +#define BLIS_SBX_L3_DECOR_PTHREADS_H + +// Definitions specific to situations when POSIX multithreading is enabled. +#ifdef BLIS_ENABLE_PTHREADS + +// Thread entry point prototype. +void* bao_l3_thread_entry( void* data_void ); + +#endif + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_single.c b/addon/gemmd/thread/bao_l3_decor_single.c new file mode 100644 index 000000000..d60891d65 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_single.c @@ -0,0 +1,143 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifndef BLIS_ENABLE_MULTITHREADING + +#define SKIP_THRINFO_TREE + +void bao_l3_thread_decorator + ( + l3sbxint_t func, + opid_t family, + //pack_t schema_a, + //pack_t schema_b, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + // For sequential execution, we use only one thread. + const dim_t n_threads = 1; + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. + bli_pba_rntm_set_pba( rntm ); + +#ifndef SKIP_THRINFO_TREE + // Allocate a global communicator for the root thrinfo_t structures. 
+ thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); +#endif + + + { + // NOTE: We don't need to create another copy of the rntm_t since + // it was already copied in one of the high-level oapi functions. + rntm_t* restrict rntm_p = rntm; + + // There is only one thread id (for the chief thread). + const dim_t tid = 0; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + // NOTE: This is commented out because, in the single-threaded case, + // this is redundant since it's already been done above. + //bli_sba_rntm_set_pool( tid, array, rntm_p ); + +#ifndef SKIP_THRINFO_TREE + thrinfo_t* thread = NULL; + + // Create the root node of the thread's thrinfo_t structure. + bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); +#else + // This optimization allows us to use one of the global thrinfo_t + // objects for single-threaded execution rather than grow one from + // scratch. The key is that bli_thrinfo_sup_grow(), which is called + // from within the variants, will immediately return if it detects + // that the thrinfo_t* passed into it is either + // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. + thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; + + ( void )tid; +#endif + + func + ( + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm_p, + thread + ); + +#ifndef SKIP_THRINFO_TREE + // Free the current thread's thrinfo_t structure. + bli_l3_sup_thrinfo_free( rntm_p, thread ); +#endif + } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). NOTE: This only applies when SKIP_THRINFO_TREE is not defined; otherwise no communicator is created above. + + // Check the array_t back into the small block allocator. 
Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. 
+ bli_sba_checkin_array( array ); +} + +#endif + diff --git a/addon/gemmd/thread/bao_l3_decor_single.h b/addon/gemmd/thread/bao_l3_decor_single.h new file mode 100644 index 000000000..211a43a89 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor_single.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SBX_L3_DECOR_SINGLE_H +#define BLIS_SBX_L3_DECOR_SINGLE_H + +// Definitions specific to situations when multithreading is disabled. +#ifndef BLIS_ENABLE_MULTITHREADING + +#endif + +#endif + diff --git a/build/bli_addon.h.in b/build/bli_addon.h.in new file mode 100644 index 000000000..36a8e29bd --- /dev/null +++ b/build/bli_addon.h.in @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_ADDON_H +#define BLIS_ADDON_H + +#if @enable_addons@ +#define BLIS_ENABLE_ADDONS +#else +#define BLIS_DISABLE_ADDONS +#endif + +// Enabled addons +@addon_list_includes@ + +#endif diff --git a/build/config.mk.in b/build/config.mk.in index 7533d1acb..63cd53e28 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -185,6 +185,10 @@ MK_ENABLE_CBLAS := @enable_cblas@ # Whether libblis will depend on libmemkind for certain memory allocations. MK_ENABLE_MEMKIND := @enable_memkind@ +# The names of the addons to include when building BLIS. If empty, no addons +# will be included. +ADDON_LIST := @addon_list@ + # The name of a sandbox defining an alternative gemm implementation. If empty, # no sandbox will be used and the conventional gemm implementation will remain # enabled. diff --git a/common.mk b/common.mk index 90c3da83f..5f2d30c9b 100644 --- a/common.mk +++ b/common.mk @@ -152,18 +152,35 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ # When compiling sandboxes, we use flags similar to those of general framework # source. This ensures that the same code can be linked and run across various -# sub-configurations. (If we switch to using refkern/kernel flags, we should -# prevent enabling sandboxes for umbrella families by verifying that -# config_list == config_name if --enable-sandbox is given.) +# sub-configurations. 
+get-addon-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cflags-for,$(1)) \ + $(CADDONINCFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) +get-addon-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + $(CADDONINCFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) + +# When compiling sandboxes, we use flags similar to those of general framework +# source. This ensures that the same code can be linked and run across various +# sub-configurations. (NOTE: If we ever switch to using refkernel or kernel +# flags, we should prevent enabling sandboxes for umbrella families by verifying +# that config_list == config_name if --enable-sandbox is given. THIS ALSO +# APPLIES TO ADDONS ABOVE.) get-sandbox-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ - $(CSBOXINCFLAGS) \ + $(CSANDINCFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cxxflags-for,$(1)) \ - $(CSBOXINCFLAGS) \ + $(CSANDINCFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) @@ -188,6 +205,8 @@ get-refkern-text-for = "('$(1)' CFLAGS for ref. kernels)" get-config-text-for = "('$(1)' CFLAGS for config code)" get-frame-text-for = "('$(1)' CFLAGS for framework code)" get-kernel-text-for = "('$(1)' CFLAGS for kernels)" +get-addon-c99text-for = "('$(1)' CFLAGS for addons)" +get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)" get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)" get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" @@ -202,6 +221,10 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" files-that-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),))) files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f)))) +# Define a function that removes duplicate strings *without* using the sort +# function. 
+rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1))) + # # --- Include makefile configuration file -------------------------------------- @@ -286,6 +309,7 @@ CONFIG_DIR := config FRAME_DIR := frame REFKERN_DIR := ref_kernels KERNELS_DIR := kernels +ADDON_DIR := addon SANDBOX_DIR := sandbox OBJ_DIR := obj LIB_DIR := lib @@ -302,11 +326,13 @@ REFNM := ref # Source suffixes. CONFIG_SRC_SUFS := c - KERNELS_SRC_SUFS := c s S - FRAME_SRC_SUFS := c +ADDON_C99_SUFS := c +ADDON_CXX_SUFS := cc cpp cxx +ADDON_SRC_SUFS := $(ADDON_C99_SUFS) $(ADDON_CXX_SUFS) + SANDBOX_C99_SUFS := c SANDBOX_CXX_SUFS := cc cpp cxx SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS) @@ -314,15 +340,21 @@ SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS) # Header suffixes. FRAME_HDR_SUFS := h +ADDON_H99_SUFS := h +ADDON_HXX_SUFS := hh hpp hxx +ADDON_HDR_SUFS := $(ADDON_H99_SUFS) $(ADDON_HXX_SUFS) + SANDBOX_H99_SUFS := h SANDBOX_HXX_SUFS := hh hpp hxx SANDBOX_HDR_SUFS := $(SANDBOX_H99_SUFS) $(SANDBOX_HXX_SUFS) # Combine all header suffixes and remove duplicates via sort(). ALL_HDR_SUFS := $(sort $(FRAME_HDR_SUFS) \ + $(ADDON_HDR_SUFS) \ $(SANDBOX_HDR_SUFS) ) ALL_H99_SUFS := $(sort $(FRAME_HDR_SUFS) \ + $(ADDON_H99_SUFS) \ $(SANDBOX_H99_SUFS) ) # The names of scripts that check output from the BLAS test drivers and @@ -349,11 +381,13 @@ SHELL := bash # Construct paths to the four primary directories of source code: # the config directory, general framework code, reference kernel code, -# and optimized kernel code. +# and optimized kernel code. Also process paths for addon and sandbox +# directories. CONFIG_PATH := $(DIST_PATH)/$(CONFIG_DIR) FRAME_PATH := $(DIST_PATH)/$(FRAME_DIR) REFKERN_PATH := $(DIST_PATH)/$(REFKERN_DIR) KERNELS_PATH := $(DIST_PATH)/$(KERNELS_DIR) +ADDON_PATH := $(DIST_PATH)/$(ADDON_DIR) SANDBOX_PATH := $(DIST_PATH)/$(SANDBOX_DIR) # Construct paths to some optional C++ template headers contributed by AMD. 
@@ -367,6 +401,7 @@ CONFIG_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(CONFIG_DIR) FRAME_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(FRAME_DIR) REFKERN_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(REFKERN_DIR) KERNELS_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(KERNELS_DIR) +ADDON_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(ADDON_DIR) SANDBOX_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR) @@ -855,6 +890,7 @@ MK_CONFIG_SRC := MK_KERNELS_SRC := MK_REFKERN_SRC := MK_FRAME_SRC := +MK_ADDON_SRC := MK_SANDBOX_SRC := # -- config -- @@ -905,6 +941,24 @@ PARENT_PATH := $(OBJ_DIR)/$(CONFIG_NAME) -include $(addsuffix /$(FRAGMENT_MK), $(REFKERN_FRAG_PATH)) -include $(addsuffix /$(FRAGMENT_MK), $(FRAME_FRAG_PATH)) +# -- addon -- + +# Construct paths to each addon. +# NOTE: If $(ADDON_LIST) is empty (because no addon was enabled at configure- +# time) then $(ADDON_PATHS) will also be empty, which will cause no fragments +# to be included. +ADDON_PATHS := $(addprefix $(ADDON_FRAG_PATH)/, $(ADDON_LIST)) + +# This variable is used by the include statements as they recursively include +# one another. For the 'addons' directory, we initialize it to that directory +# in preparation to include the fragments in the configuration sub-directory. +PARENT_SRC_PATH := $(ADDON_PATH) +PARENT_PATH := $(ADDON_FRAG_PATH) + +# Recursively include the makefile fragments in each of the addons sub- +# directories. +-include $(addsuffix /$(FRAGMENT_MK), $(ADDON_PATHS)) + # -- sandbox -- # Construct paths to each sandbox. (At present, there can be only one.) @@ -922,6 +976,8 @@ PARENT_PATH := $(SANDBOX_FRAG_PATH) # Recursively include the makefile fragments in the sandbox sub-directory. -include $(addsuffix /$(FRAGMENT_MK), $(SANDBOX_PATHS)) +# -- post-processing -- + # Create a list of the makefile fragments using the variable into which each # of the above include statements accumulated their directory paths. 
MAKEFILE_FRAGMENTS := $(addsuffix /$(FRAGMENT_MK), $(FRAGMENT_DIR_PATHS)) @@ -940,14 +996,14 @@ endif # # Define a function that will expand all of the directory paths given in $(1) -# to actual filepaths using the list of suffixes provided $(2). +# to actual filepaths using the list of suffixes provided in $(2). get-filepaths = $(strip $(foreach path, $(1), \ $(foreach suf, $(2), \ $(wildcard $(path)/*.$(suf)) \ ) ) ) # Define a function that will expand all of the directory paths given in $(1) -# to actual filepaths using the list of suffixes provided $(2), taking only +# to actual filepaths using the list of suffixes provided in $(2), taking only # the first expansion from each directory with at least one file matching # the current suffix. Finally, strip the filenames from all resulting files, # returning only the directory paths. @@ -957,20 +1013,29 @@ get-dirpaths = $(dir $(foreach path, $(1), \ $(wildcard $(path)/*.$(suf)) \ ) ) ) ) -# We'll use two directory lists. The first is a list of all of the directories -# in which makefile fragments were generated (plus the current directory). The -# second is the subset of the first that begins with the sandbox root path. +# We'll use three directory lists. The first is a list of all of the directories +# in which makefile fragments were generated, plus the current directory. (The +# current directory is needed so we include bli_config.h and bli_addon.h in the +# processing of header files.) The second and third are subsets of the first +# that begins with the addon and sandbox root paths, respectively. ALLFRAG_DIR_PATHS := . 
$(FRAGMENT_DIR_PATHS) +ADDON_DIR_PATHS := $(filter $(ADDON_PATH)/%,$(ALLFRAG_DIR_PATHS)) SANDBOX_DIR_PATHS := $(filter $(SANDBOX_PATH)/%,$(ALLFRAG_DIR_PATHS)) ALL_H99_FILES := $(call get-filepaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) -FRAME_H99_FILES := $(filter-out $(SANDBOX_PATH)/%,$(ALL_H99_FILES)) +FRAME_H99_FILES := $(filter-out $(ADDON_PATH)/%, \ + $(filter-out $(SANDBOX_PATH)/%, \ + $(ALL_H99_FILES) \ + ) ) -ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) +ALL_H99_DIRPATHS := $(call get-dirpaths,$(ALLFRAG_DIR_PATHS),$(ALL_H99_SUFS)) -SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS)) -SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS)) +ADDON_H99_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_H99_SUFS)) +ADDON_HXX_FILES := $(call get-filepaths,$(ADDON_DIR_PATHS),$(ADDON_HXX_SUFS)) +ADDON_HDR_DIRPATHS := $(call get-dirpaths,$(ADDON_DIR_PATHS),$(ALL_HDR_SUFS)) +SANDBOX_H99_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_H99_SUFS)) +SANDBOX_HXX_FILES := $(call get-filepaths,$(SANDBOX_DIR_PATHS),$(SANDBOX_HXX_SUFS)) SANDBOX_HDR_DIRPATHS := $(call get-dirpaths,$(SANDBOX_DIR_PATHS),$(ALL_HDR_SUFS)) @@ -1025,8 +1090,8 @@ CBLAS_H_FLAT := $(BASE_INC_PATH)/$(CBLAS_H) # # Obtain a list of header files #included inside of the bli_cntx_ref.c file. -# Paths to these files will be needed when compiling with the monolithic -# header. +# Due to the way that bli_cntx_ref.c uses headers and macros, paths to these +# files will be needed when compiling bli_cntx_ref.c with the monolithic header. ifeq ($(strip $(SHARE_PATH)),.) 
REF_KER_SRC := $(DIST_PATH)/$(REFKERN_DIR)/bli_cntx_ref.c REF_KER_HEADERS := $(shell $(GREP) "\#include" $(REF_KER_SRC) | sed -e "s/\#include [\"<]\([a-zA-Z0-9\_\.\/\-]*\)[\">].*/\1/g" | $(GREP) -v $(BLIS_H)) @@ -1034,9 +1099,10 @@ endif # Match each header found above with the path to that header, and then strip # leading, trailing, and internal whitespace. -REF_KER_H_PATHS := $(strip $(foreach header, $(REF_KER_HEADERS), \ - $(dir $(filter %/$(header), \ - $(FRAME_H99_FILES))))) +REF_KER_H_PATHS := $(call rm-dups,$(strip \ + $(foreach header, $(REF_KER_HEADERS), \ + $(dir $(filter %/$(header), \ + $(FRAME_H99_FILES)))))) # Add -I to each header path so we can specify our include search paths to the # C compiler. Then add frame/include since it's needed when compiling source @@ -1056,17 +1122,22 @@ ifeq ($(MK_ENABLE_CBLAS),yes) CINCFLAGS += -I$(CBLAS_H_DIRPATH) endif +# Obtain a list of header paths in the configured addons. Then add -I to each +# header path. +CADDONINCFLAGS := $(strip $(patsubst %, -I%, $(ADDON_HDR_DIRPATHS))) + # Obtain a list of header paths in the configured sandbox. Then add -I to each # header path. -CSBOXINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS))) +CSANDINCFLAGS := $(strip $(patsubst %, -I%, $(SANDBOX_HDR_DIRPATHS))) # # --- BLIS configuration header definitions ------------------------------------ # -# This file was created by configure, but we need to define it here so we can -# remove it as part of the clean targets. +# These files were created by configure, but we need to define them here so we +# can remove them as part of the clean targets. +BLIS_ADDON_H := ./bli_addon.h BLIS_CONFIG_H := ./bli_config.h diff --git a/configure b/configure index 3c865dad9..15577eb22 100755 --- a/configure +++ b/configure @@ -270,6 +270,15 @@ print_usage() echo " \"small\" depends on thresholds that may vary by sub-" echo " configuration." 
+ echo " " + echo " -a NAME --enable-addon=NAME" + echo " " + echo " Enable the code provided by an addon. An addon consists" + echo " of a separate directory of code that provides additional" + echo " APIs, implementations, and/or operations that would" + echo " otherwise not be present within a build of BLIS. This" + echo " option may be used multiple times to specify the inclusion" + echo " of multiple addons. By default, no addons are enabled." + echo " " echo " -s NAME --enable-sandbox=NAME" echo " " echo " Enable a separate sandbox implementation of gemm. This" @@ -973,6 +982,18 @@ canonicalize_ws() echo "${str}" } +rm_duplicate_words_simple() +{ + local str revstr revres res + + str="$1" + + # Remove duplicate words, keeping the first occurrence of each (every kept word is printed followed by awk's FS). + res=$(echo "${str}" | awk '{for (i=1;i<=NF;i++) if (!a[$i]++) printf("%s%s",$i,FS)}{printf("\n")}') + + echo "${res}" +} + rm_duplicate_words() { local str revstr revres res @@ -1958,6 +1979,13 @@ main() bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}" bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}" + # The names/paths for the template bli_addon.h.in and its instantiated + # counterpart. + bli_addon_h_in='bli_addon.h.in' + bli_addon_h_out='bli_addon.h' + bli_addon_h_in_path="${build_dirpath}/${bli_addon_h_in}" + bli_addon_h_out_path="${cur_dirpath}/${bli_addon_h_out}" + # Path to 'mirror-tree.sh' script. mirror_tree_sh="${build_dirpath}/mirror-tree.sh" @@ -1981,6 +2009,10 @@ main() frame_dir='frame' frame_dirpath="${dist_path}/${frame_dir}" + # The name of the addon directory. + addon_dir='addon' + addon_dirpath="${dist_path}/${addon_dir}" + # The name of the sandbox directory. sandbox_dir='sandbox' sandbox_dirpath="${dist_path}/${sandbox_dir}" @@ -2088,6 +2120,10 @@ main() force_version='no' complex_return='default' + # The addon flag and names. + addon_flag='' + addon_list='' + # The sandbox flag and name. 
sandbox_flag='' sandbox='' @@ -2132,7 +2168,7 @@ main() # Process our command line options. 
unset OPTIND - while getopts ":hp:d:e:s:t:r:qci:b:-:" opt; do + while getopts ":hp:d:e:a:s:t:r:qci:b:-:" opt; do case $opt in -) case "$OPTARG" in @@ -2239,12 +2275,21 @@ main() disable-mem-tracing) enable_mem_tracing='no' ;; + enable-addon=*) + addon_flag=1 + addon_name=${OPTARG#*=} + # Append the addon name to the list. + addon_list="${addon_list} ${addon_name}" + ;; + disable-addon) + addon_flag='' + ;; enable-sandbox=*) sandbox_flag=1 sandbox=${OPTARG#*=} ;; disable-sandbox) - sandbox_flag=0 + sandbox_flag='' ;; int-size=*) int_type_size=${OPTARG#*=} @@ -2321,6 +2366,12 @@ main() e) export_shared=$OPTARG ;; + a) + addon_flag=1 + addon_name=$OPTARG + # Append the addon name to the list. + addon_list="${addon_list} ${addon_name}" + ;; s) sandbox_flag=1 sandbox=$OPTARG @@ -3168,6 +3219,34 @@ main() exit 1 fi + # Check if addons were given. + if [ -n "${addon_flag}" ]; then + + # Remove duplicates in the addon list, if they exist. + addon_list=$(rm_duplicate_words_simple "${addon_list}") + + echo "${script_name}: configuring with addons:" + + for addon in ${addon_list}; do + + echo "${script_name}: ${addon_dir}/${addon}" + + addon_fullpath="${addon_dirpath}/${addon}" + + if [ ! -d "${addon_fullpath}" ]; then + echo "${script_name}: requested addon sub-directory does not exist! Cannot continue." + echo "${script_name}: *** Please verify addon existence and name." + exit 1 + fi + done + + enable_addons_01=1 + else + echo "${script_name}: configuring with no addons." + + enable_addons_01=0 + fi + # Check if a sandbox was given. if [ -n "${sandbox_flag}" ]; then @@ -3306,6 +3385,15 @@ main() kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n" done + # Create a list of #includes, one for each addon in addon_list. + addon_list_includes="" + for addon in ${addon_list}; do + + # Create an #include directive and add it to the running list. 
+ addon_header="\"${addon}.h\"" + addon_list_includes="${addon_list_includes}#include ${addon_header}\n" + done + # -- Determine whether we are performing an out-of-tree build -------------- @@ -3333,7 +3421,7 @@ main() fi - # -- Instantiate config.mk, bli_config.h files from templates -------------- + # -- Instantiate config.mk file from template ------------------------------ # Begin substituting information into the config_mk_in file, outputting # to config_mk_out. @@ -3380,9 +3468,11 @@ main() | sed -e "s/@enable_cblas@/${enable_cblas}/g" \ | sed -e "s/@enable_memkind@/${enable_memkind}/g" \ | sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \ + | sed -e "s/@addon_list@/${addon_list}/g" \ | sed -e "s/@sandbox@/${sandbox}/g" \ > "${config_mk_out_path}" + # -- Instantiate bli_config.h file from template --------------------------- # Begin substituting information into the bli_config_h_in file, outputting # to bli_config_h_out. NOTE: We use perl instead of sed because the version @@ -3417,6 +3507,17 @@ main() | sed -e "s/@complex_return_intel@/${complex_return_intel01}/g" \ > "${bli_config_h_out_path}" + # -- Instantiate bli_addon.h file from template ---------------------------- + + # Begin substituting information into the bli_addon_h_in file, outputting + # to bli_addon_h_out. NOTE: We use perl instead of sed because the version + # of sed used on OS X is old and does not handle the '\n' character + # intuitively, which was used when constructing ${addon_list_includes}. 
+ echo "${script_name}: creating ${bli_addon_h_out_path} from ${bli_addon_h_in_path}" + cat "${bli_addon_h_in_path}" \ + | perl -pe "s/\@addon_list_includes\@/${addon_list_includes}/g" \ + | sed -e "s/@enable_addons@/${enable_addons_01}/g" \ + > "${bli_addon_h_out_path}" # -- Create top-level object directories ----------------------------------- @@ -3429,7 +3530,6 @@ main() obj_config_dirpath="${base_obj_dirpath}/${config_dir}" - #echo "${script_name}: creating ${obj_config_dirpath}" mkdir -p ${obj_config_dirpath} for conf in ${config_list}; do echo "${script_name}: creating ${obj_config_dirpath}/${conf}" @@ -3439,7 +3539,6 @@ main() obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}" - #echo "${script_name}: creating ${obj_kernels_dirpath}" mkdir -p ${obj_kernels_dirpath} for kern in ${kernel_list}; do echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}" @@ -3449,7 +3548,6 @@ main() obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}" - #echo "${script_name}: creating ${obj_refkern_dirpath}" mkdir -p ${obj_refkern_dirpath} for conf in ${config_list}; do echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}" @@ -3462,6 +3560,18 @@ main() echo "${script_name}: creating ${obj_frame_dirpath}" mkdir -p ${obj_frame_dirpath} + + if [ -n "${addon_flag}" ]; then + + obj_addon_dirpath="${base_obj_dirpath}/${addon_dir}" + + for addon in ${addon_list}; do + echo "${script_name}: creating ${obj_addon_dirpath}/${addon}" + mkdir -p ${obj_addon_dirpath}/${addon} + done + fi + + if [ -n "${sandbox_flag}" ]; then obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}" @@ -3489,6 +3599,7 @@ main() echo "${script_name}: creating ${base_lib_dirpath}" mkdir -p ${base_lib_dirpath} + # Create include directory (if it does not already exist). 
base_include_dirpath="${include_dirpath}/${config_name}" @@ -3543,6 +3654,16 @@ main() echo "${script_name}: mirroring ${frame_dirpath} to ${obj_frame_dirpath}" ${mirror_tree_sh} ${frame_dirpath} ${obj_frame_dirpath} + # Mirror the chosen addon source tree to its object sub-directory. + if [ -n "${addon_flag}" ]; then + + for addon in ${addon_list}; do + + echo "${script_name}: mirroring ${addon_dirpath}/${addon} to ${obj_addon_dirpath}/${addon}" + ${mirror_tree_sh} "${addon_dirpath}/${addon}" "${obj_addon_dirpath}/${addon}" + done + fi + # Mirror the chosen sandbox source tree to its object sub-directory. if [ -n "${sandbox_flag}" ]; then @@ -3629,6 +3750,25 @@ main() ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list + # Generate makefile fragments in the addon sub-directory. + if [ -n "${addon_flag}" ]; then + + for addon in ${addon_list}; do + + echo "${script_name}: creating makefile fragments in ${obj_addon_dirpath}/${addon}" + ${gen_make_frags_sh} \ + -h -r -v0 \ + -o ${script_name} \ + -p 'ADDON' \ + ${addon_dirpath}/${addon} \ + ${obj_addon_dirpath}/${addon} \ + ${gen_make_frags_dirpath}/fragment.mk \ + ${gen_make_frags_dirpath}/suffix_list \ + ${gen_make_frags_dirpath}/ignore_list + done + fi + + # Generate makefile fragments in the sandbox sub-directory. if [ -n "${sandbox_flag}" ]; then diff --git a/docs/Addons.md b/docs/Addons.md new file mode 100644 index 000000000..595cebfa4 --- /dev/null +++ b/docs/Addons.md @@ -0,0 +1,231 @@ +## Contents + +* **[Introduction](Addons.md#introduction)** +* **[Enabling an addon](Addons.md#enabling-an-addon)** +* **[Addon rules](Addons.md#addon-rules)** +* **[Caveats](Addons.md#caveats)** +* **[Known issues](Addons.md#known-issues)** +* **[Conclusion](Addons.md#conclusion)** + + +## Introduction + +This file briefly describes the requirements for building a custom BLIS +*addon*. 
+ +Simply put, an addon in BLIS provides additional APIs, operations, and/or +implementations that may be useful to certain users. An addon can be +thought of as a standalone extension of BLIS that does not depend on any +other addon, although addons may utilize existing functionality or kernels +within the core framework. + +By definition, an addon should *never* provide APIs that conflict with +the interfaces that belong to either the [typed API](BLISTypedAPI.md) or the +[object API](BLISObjectAPI.md). Thus, you'll never have to worry about a +properly constructed (and properly functioning) addon interfering with or +otherwise changing core BLIS functionality. + +How does an addon differ from a [sandbox](Sandboxes.md)? Great question! +Sometimes you want to include additional BLIS-like functionality that does +not relate directly to `gemm` or any other BLIS operation. +(By contrast, a sandbox requires you to implement `gemm` whether you want +to or not.) +Furthermore, you may wish to enable multiple addons simultaneously. +(By contrast, only one sandbox may be enabled at a time.) +Thus, the addon feature provides additional flexibility to some +users in a way that sandboxes cannot, while still providing many of the +conveniences of sandboxes. + +## Enabling an addon + +To enable an existing addon at configure-time, you simply specify it as an +option to `configure`. Either of the following usages are accepted: +``` +$ ./configure --enable-addon=foobar auto +$ ./configure -a foobar auto +``` +Here, we tell `configure` that we want to use the `foobar` addon, which +corresponds to a subdirectory of the `addon` directory named `foobar`. +(Reminder: the `auto` argument is the configuration target and +unrelated to addons.) + +You may also enable multiple addons within the same build of BLIS: +``` +$ ./configure -a foobar -a thing1 -a thing2 auto +``` +Note that the default behavior of `configure` is that no addons are enabled. 
+ +As `configure` runs, you should get output that includes lines +similar to: +``` +configure: configuring with addons: +configure: addon/foobar +configure: addon/thing1 +configure: addon/thing2 +``` +And when you build BLIS, the addon source code will be among the last files to +be compiled: +``` +Compiling obj/haswell/addon/foobar/foobar.o ('haswell' CFLAGS for addons) +Compiling obj/haswell/addon/thing1/thing1.o ('haswell' CFLAGS for addons) +Compiling obj/haswell/addon/thing1/thing1_api.o ('haswell' CFLAGS for addons) +Compiling obj/haswell/addon/thing2/thing2_api.o ('haswell' CFLAGS for addons) +... +``` +That's it! After the BLIS library is built, it will contain your chosen +addons. You can always confirm this by using `nm` to confirm the presence +of your API symbols: +``` +$ nm lib/haswell/libblis.a | grep foobar +foobar.o: +0000000000000000 T foobar +``` + +## Addon rules + +Please follow these guidelines for the best developer experience when +creating addons. + +1. As with sandboxes, you don't need to worry about creating makefiles. The +BLIS build system will take care of this for you. :) By configuring BLIS with +an addon enabled, `make` will scan your addon subdirectory and compile +all of its source code using similar compilation rules as were used for the rest +of the framework. In addition, the compilation command line will automatically +contain one `-I` option for every subdirectory in your addon, +so it doesn't matter where in your addon directory hierarchy you place your +header files -- they will be found! + +2. We recommend that you write your addon in C99. While you *may* use C++11 +to implement your addon, you should provide a C99 wrapper API to your +implementation so that others can interface with it. There is no guarantee +that the end-user will be using a C++11 compiler, and therefore you should +limit the definitions in your addon header to those that are C99 compliant. 
+If you write your addon in C++11, you must use one of the BLIS-approved file +extensions for your source files (`.cc`, `.cpp`, `.cxx`) and your local +header files (`.hh`, `.hpp`, `.hxx`). +Note that `blis.h` already contains all of its definitions inside of an +`extern "C"` block, so you should be able to `#include "blis.h"` from your +C++11 source code without any issues. + +3. All of your code related to the addon should reside within the named +addon directory, or some subdirectory therein. If your addon requires +new kernels, you should add kernel source code to an appropriate +microarchitecture-specific subdirectory within the top-level `kernels` +directory so that they are compiled with the correct +microarchitecture-specific optimization flags. + +4. If your addon is named `foobar`, the BLIS build system will expect to +find a header called `foobar.h` somewhere in the `addon/foobar` directory +(or one of its subdirectories). This `foobar.h` header will automatically +be inlined into the monolithic `blis.h` header that is produced by the +BLIS build system. `foobar.h` may `#include` other local headers, each of +which will also (recursively) get inlined into `blis.h`. However, you may +choose to omit some local addon headers from `foobar.h`. You might do this, +for example, because those headers define things that are not needed in +order for the end user to call your addon code. + +5. Your addon APIs will always be available within static library builds of +BLIS, but if you want your addon APIs to be exported as public APIs within +*shared* library builds of BLIS, you'll need to annotate the prototypes +accordingly. (BLIS makes its shared library symbols private by default; this +allows us to export only those functions that we consider to be part of the +public APIs.) This annotation can be done by prefixing function prototypes +with the `BLIS_EXPORT_ADDON` macro as follows: +```c +BLIS_EXPORT_ADDON void foobar_calc( void* a, void* b ); +``` + +6. 
Do not define any symbols in your addon that conflict with any symbols within +the core framework. For example, don't define a function called `bli_copym()` +in your addon since that function is already defined within BLIS. + +7. Do not define any symbols in your addon that conflict with any symbols within +the C99 standard libraries/headers. For example, don't define a function called +`printf()` since that function is already defined within the C99 standard library. + +8. *Try* to not define any symbols in your addon that conflict with symbols in any +other addon, unless your addon is meant to serve as an alternative to the +conflicting addon, in which case conflicting symbol names is okay (since you +will presumably never build with both addons enabled). + +9. When choosing names for your addon files, avoid source filenames that already +exist within BLIS. For example, don't name one of your files `bli_obj.c` +since that file would compile into `bli_obj.o`, which will have already been +placed into the library by the build system. + +10. Similarly, avoid header filenames that already exist within BLIS or C99. +For example, don't name one of your header files `bli_obj.h` since that file +already exists in BLIS. Also, don't name one of your header files `math.h` +since that name would conflict with the `math.h` defined by C99. (This also +means you shouldn't name your addon `math` since normally that name would +require that you provide a `math.h` header inside the addon directory.) + +If you follow these rules, you will be much more likely to have a pleasant +experience integrating your BLIS addon into the larger framework. + +## Caveats + +Notice that the BLIS addons are limited in what they can accomplish. Generally +speaking, addons cannot change existing implementations within BLIS. Instead, +addons aim to provide a way to quickly augment BLIS with additional bundles of +code that extend BLIS's set of functionality in some interesting way. 
If you +want to define new BLAS-like functions, but don't know where to start, creating +a new addon is an appropriate place to start experimenting. If you want to +change or refactor existing BLIS code, an addon is probably not suited for your +needs. + +Another important limitation is the fact that the build system currently uses +"framework `CFLAGS`" when compiling the addon source files. These are the same +`CFLAGS` used when compiling general framework source code, +``` +# Example framework CFLAGS used by 'haswell' sub-configuration +-O2 -Wall -Wno-unused-function -Wfatal-errors -fPIC -std=c99 +-D_POSIX_C_SOURCE=200112L -Iinclude/haswell -I./frame/3/ +-I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include +-DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden +``` +which are likely more general-purpose than the `CFLAGS` used for, say, +optimized kernels or even reference kernels: +``` +# Example optimized kernel CFLAGS used by 'haswell' sub-configuration +-O3 -fomit-frame-pointer -mavx2 -mfma -mfpmath=sse -march=haswell -Wall +-Wno-unused-function -Wfatal-errors -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L +-Iinclude/haswell -I./frame/3/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/ +-I./frame/include -DBLIS_VERSION_STRING=\"0.8.1-195\" -fvisibility=hidden +``` +(To see precisely which flags are being employed for any given file, enable +verbosity at compile-time via `make V=1`.) Compiling addons with these more +versatile `CFLAGS` compiler options means that we only need to compile one +instance of each addon source file, even when targeting multiple +configurations (for example, via `./configure x86_64`). However, it also means +that addons are not ideal for microkernels, as they sometimes need additional +compiler flags in order to +yield the highest performance. If you have a new microkernel you would like to +use within an addon, you can always develop it within that addon. 
However, +once it is stable and ready for use by others, it's best to move the kernel(s) +to the appropriate microarchitecture-specific subdirectory of the `kernels` +directory. This will allow the kernel to be compiled with the +appropriate microarchitecture-specific compiler flags. +Please see the +[Configuration Guide](ConfigurationHowTo.md) +for more details, and when in doubt, please don't be shy about seeking +guidance from BLIS developers by opening a +[new issue](https://github.com/flame/blis/issues) or sending a message to the +[blis-devel](http://groups.google.com/d/forum/blis-devel) mailing list. + +Notwithstanding these limitations, hopefully you still find BLIS addons +useful! + +## Known issues + +* None yet. + +## Conclusion + +If you encounter any problems, please open +a new [issue on GitHub](https://github.com/flame/blis/issues). + +If you are unsure about how something works, you can still open an issue. Or, you +can send a message to the +[blis-devel](https://groups.google.com/d/forum/blis-devel) mailing list. + diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 86f23df6e..5a4c8a15d 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -231,8 +231,9 @@ #endif #endif -#define BLIS_EXPORT_BLIS BLIS_EXPORT -#define BLIS_EXPORT_BLAS BLIS_EXPORT +#define BLIS_EXPORT_BLIS BLIS_EXPORT +#define BLIS_EXPORT_BLAS BLIS_EXPORT +#define BLIS_EXPORT_ADDON BLIS_EXPORT // -- STATIC INLINE FUNCTIONS -------------------------------------------------- diff --git a/frame/include/blis.h b/frame/include/blis.h index b374e8539..98ebee878 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -196,6 +196,14 @@ extern "C" { #include "bli_util.h" +// -- addon definitions -- + +// NOTE: These definitions should not be included much earlier since an addon +// may wish to utilize other types and definitions provided by BLIS. 
+ +#include "bli_addon.h" + + // -- sandbox implementation -- #include "bli_sbox.h" From 78cd1b045155ddf0b9ec6e2ab815f2b216ad9a9e Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 16 Nov 2021 15:53:40 -0600 Subject: [PATCH 004/230] Added 'Example Code' section to README.md. Details: - Inserted a new 'Example Code' section into the README.md immediately after the 'Getting Started' section. Thanks to Devin Matthews for recommending this addition. - Moved the 'Performance' section of the README down slightly so that it appears after the 'Documentation' section. --- README.md | 77 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index f4ec4acb3..2abe79400 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ Contents * **[Key Features](#key-features)** * **[How to Download BLIS](#how-to-download-blis)** * **[Getting Started](#getting-started)** -* **[Performance](#performance)** +* **[Example Code](#example-code)** * **[Documentation](#documentation)** +* **[Performance](#performance)** * **[External Packages](#external-packages)** * **[Discussion](#discussion)** * **[Contributing](#contributing)** @@ -394,23 +395,41 @@ If/when you have time, we *strongly* encourage you to read the detailed walkthrough of the build system found in our [Build System](docs/BuildSystem.md) guide. -Performance ------------ - -We provide graphs that report performance of several implementations across a -range of hardware types, multithreading configurations, problem sizes, -operations, and datatypes. These pages also document most of the details needed -to reproduce these experiments. 
+Example Code +------------ - * **[Performance](docs/Performance.md).** This document reports empirically -measured performance of a representative set of level-3 operations on a variety -of hardware architectures, as implemented within BLIS and other BLAS libraries -for all four of the standard floating-point datatypes. +The BLIS source distribution provides example code in the `examples` directory. +Example code focuses on using BLIS APIs (not BLAS or CBLAS), and resides in +two subdirectories: [examples/oapi](examples/oapi) (which demonstrates the +[object API](docs/BLISObjectAPI.md)) and [examples/tapi](examples/tapi) (which +demonstrates the [typed API](docs/BLISTypedAPI.md)). + +Either directory contains several files, each containing various pieces of +code that exercise core functionality of the BLIS API in question (object or +typed). These example files should be thought of collectively like a tutorial, +and therefore it is recommended to start from the beginning (the file that +starts in `00`). + +You can build all of the examples by simply running `make` from either example +subdirectory (`examples/oapi` or `examples/tapi`). (You can also run +`make clean`.) The local `Makefile` assumes that you've already configured and +built (but not necessarily installed) BLIS two directories up, in `../..`. If +you have already installed BLIS to some permanent directory, you may refer to +that installation by setting the environment variable `BLIS_INSTALL_PATH` prior +to running make: +``` +export BLIS_INSTALL_PATH=/usr/local; make +``` +or by setting the same variable as part of the make command: +``` +make BLIS_INSTALL_PATH=/usr/local +``` +**Once the executable files have been built, we recommend reading the code and +the corresponding executable output side by side. 
This will help you see the +effects of each section of code.** - * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports -empirically measured performance of `gemm` on select hardware architectures -within BLIS and other BLAS libraries when performing matrix problems where one -or two dimensions is exceedingly small. +This tutorial is not exhaustive or complete; several object API functions were +omitted (mostly for brevity's sake) and thus more examples could be written. Documentation ------------- @@ -432,16 +451,12 @@ included BLAS test drivers. * **[BLIS Typed API Reference](docs/BLISTypedAPI.md).** Here we document the so-called "typed" (or BLAS-like) API. This is the API that many users who are -already familiar with the BLAS will likely want to use. You can find lots of -example code for the typed API in the [examples/tapi](examples/tapi) directory -included in the BLIS source distribution. +already familiar with the BLAS will likely want to use. * **[BLIS Object API Reference](docs/BLISObjectAPI.md).** Here we document the object API. This is API abstracts away properties of vectors and matrices within `obj_t` structs that can be queried with accessor functions. Many -developers and experts prefer this API over the typed API. You can find lots of -example code for the object API in the [examples/oapi](examples/oapi) directory -included in the BLIS source distribution. +developers and experts prefer this API over the typed API. * **[Hardware Support](docs/HardwareSupport.md).** This document maintains a table of supported microarchitectures. @@ -501,6 +516,24 @@ please read this thorough walkthrough of the configuration system. about using sandboxes in BLIS--that is, providing alternative implementations of the `gemm` operation--please read this document. 
+Performance +----------- + +We provide graphs that report performance of several implementations across a +range of hardware types, multithreading configurations, problem sizes, +operations, and datatypes. These pages also document most of the details needed +to reproduce these experiments. + + * **[Performance](docs/Performance.md).** This document reports empirically +measured performance of a representative set of level-3 operations on a variety +of hardware architectures, as implemented within BLIS and other BLAS libraries +for all four of the standard floating-point datatypes. + + * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports +empirically measured performance of `gemm` on select hardware architectures +within BLIS and other BLAS libraries when performing matrix problems where one +or two dimensions is exceedingly small. + External Packages ----------------- From cbc88feb51b949ce562d044cf9f99c4e46bb8a39 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 16 Nov 2021 16:02:39 -0600 Subject: [PATCH 005/230] Marked some markdown shell code blocks as 'bash'. Details: - Annotated the code blocks that represent shell commands and output as 'bash' in README.md and BuildSystem.md. --- README.md | 12 ++++++------ docs/BuildSystem.md | 40 ++++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 2abe79400..21bfe10d3 100644 --- a/README.md +++ b/README.md @@ -337,7 +337,7 @@ slightly out of date.) URL by clicking on the green button above the file/directory listing near the top of this page (as rendered by GitHub). 
Generally speaking, it will amount to executing the following command in your terminal shell: - ``` + ```bash git clone https://github.com/flame/blis.git ``` @@ -375,18 +375,18 @@ as discussed in [the previous section](#how-to-download-blis).* If you just want to build a sequential (not parallelized) version of BLIS in a hurry and come back and explore other topics later, you can configure and build BLIS as follows: -``` +```bash $ ./configure auto $ make [-j] ``` You can then verify your build by running BLAS- and BLIS-specific test drivers via `make check`: -``` +```bash $ make check [-j] ``` And if you would like to install BLIS to the directory specified to `configure` via the `--prefix` option, run the `install` target: -``` +```bash $ make install ``` Please read the output of `./configure --help` for a full list of configure-time @@ -417,11 +417,11 @@ built (but not necessarily installed) BLIS two directories up, in `../..`. If you have already installed BLIS to some permanent directory, you may refer to that installation by setting the environment variable `BLIS_INSTALL_PATH` prior to running make: -``` +```bash export BLIS_INSTALL_PATH=/usr/local; make ``` or by setting the same variable as part of the make command: -``` +```bash make BLIS_INSTALL_PATH=/usr/local ``` **Once the executable files have been built, we recommend reading the code and diff --git a/docs/BuildSystem.md b/docs/BuildSystem.md index 5e290d9bb..60fd541d6 100644 --- a/docs/BuildSystem.md +++ b/docs/BuildSystem.md @@ -40,14 +40,14 @@ Finally, we also require various other shell utilities that are so ubiquitous th Before starting, you must obtain a copy of BLIS. If you are an end-user (i.e., not a developer), you can download a tarball or zip file of the latest tagged version by returning to the main [BLIS homepage](https://github.com/flame/blis) and clicking on the [releases](https://github.com/flame/blis/releases) link. 
**However**, we highly recommend that you instead clone a copy using the command: -``` +```bash $ git clone https://github.com/flame/blis.git ``` Cloning a repository allows users and developers alike to quickly and easily pull in new commits as they are available, including commits that occur **between** tagged releases. Once you download the BLIS distribution, the top-level directory should look something like: -``` +```bash $ ls CHANGELOG Makefile common.mk configure mpi_test testsuite CREDITS README.md config frame obj version @@ -63,7 +63,7 @@ The first step is to choose how to configure BLIS. Specifically, a user must dec Configurations are described in detail in the [Configuration Guide](ConfigurationHowTo.md). Generally speaking, a configuration consists of several files that reside in a sub-directory of the `config` directory. To see a list of the available configurations, you may inspect this directory, or run `configure` with no arguments. Here are the current (as of this writing) contents of the `config` directory: -``` +```bash $ ls config amd64 cortexa15 excavator intel64 old power7 template bgq cortexa57 generic knc penryn sandybridge zen @@ -85,19 +85,19 @@ Multithreading in BLIS is disabled by default. For more information on enabling ## Step 2: Running `configure` This step should be somewhat familiar to many people who use open source software. To configure the build system, simply run: -``` +```bash $ ./configure ``` where `` is the configuration sub-directory name you chose in [Step 1](BuildSystem.md#step-1-choose-a-framework-configuration) above. If `` is not given, a helpful message is printed reminding you to explicit specify a configuration name along with a list of valid configuration families and their implied sub-configurations. For more information on sub-configurations and families, please see the BLIS [Configuration Guide](ConfigurationHowTo.md). 
Alternatively, `configure` can automatically select a configuration based on your hardware: -``` +```bash $ ./configure auto ``` However, as of this writing, BLIS lacks support for automatically detecting some architectures. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used. Upon running configure, you will get output similar to the following. The exact output will depend on whether you cloned BLIS from a `git` repository or whether you obtained BLIS via a downloadable tarball from the [releases](https://github.com/flame/blis/releases) page. -``` +```bash $ ./configure --prefix=$HOME/blis haswell configure: using 'gcc' compiler. configure: found gcc version 5.4.0 (maj: 5, min: 4, rev: 0). @@ -174,17 +174,17 @@ configure: creating makefile fragments in ./frame configure: configured to build within top-level directory of source distribution. ``` The installation prefix can be specified via the `--prefix=PREFIX` option: -``` +```bash $ ./configure --prefix=/usr ``` This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `/usr/local`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively: -``` +```bash $ ./configure --libdir=/usr/lib --includedir=/usr/include ``` The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any path implied by `PREFIX`, whether it was specified explicitly via `--prefix` or implicitly (via the default). That is, `LIBDIR` defaults to `EXECPREFIX/lib` (where `EXECPREFIX`, set via `--exec-prefix=EXECPREFIX`, defaults to `PREFIX`) and `INCDIR` defaults to `PREFIX/include`, but `LIBDIR` and `INCDIR` will each be overriden by their respective `--libdir`/`--includedir` options. 
There is a third related option, `--sharedir=SHAREDIR`, where `SHAREDIR` defaults to `PREFIX/share`. This option specifies the installation directory for certain makefile fragments that contain variables determined by `configure` (e.g. `CC`, `CFLAGS`, `LDFLAGS`, etc.). These files allow certain BLIS makefiles, such as those in the `examples` or `testsuite` directories, to operate on an installed copy of BLIS rather than a local (and possibly uninstalled) copy. For a complete list of supported `configure` options and arguments, run `configure` with the `-h` option: -``` +```bash $ ./configure -h ``` The output from this invocation of `configure` should give you an up-to-date list of options and their descriptions. @@ -192,7 +192,7 @@ The output from this invocation of `configure` should give you an up-to-date lis ## Step 3: Compilation Once `configure` is finished, you are ready to instantiate (compile) BLIS into a library by running `make`. Running `make` will result in output similar to: -``` +```bash $ make Generating monolithic blis.h......................................................... ..................................................................................... @@ -209,11 +209,11 @@ Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int.o ('haswell' CFLAGS for ker Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int10.o ('haswell' CFLAGS for kernels) ``` If you want to see the individual command line invocations of the compiler, you can run `make` as follows: -``` +```bash $ make V=1 ``` Also, if you are compiling on a multicore system, you can get parallelism via: -``` +```bash $ make -j ``` where `` is the number of jobs `make` is allowed to run simultaneously. Generally, you should limit `` to p+1, where p is the number of processor cores on your system. @@ -236,7 +236,7 @@ The archiver and/or linker should no longer choke when creating the libraries. 
## Step 3b: Testing (optional) If you would like to run some ready-made tests that exercise BLIS in a number of ways, including through its BLAS compatibility layer, run `make check`: -``` +```bash $ make check ``` Watch the output near the end. You should see the following messages, though not necessarily in immediate succession: @@ -263,7 +263,7 @@ Archiving lib/haswell/libblis.a Dynamically linking lib/haswell/libblis.so ``` Now you have a BLIS library (in static and shared forms) residing in the `lib//` directory. To install the libraries and the header files associated with it, simply execute: -``` +```bash $ make install ``` This installs copies of the libraries and header files, and also creates conventional symbolic links of shared libraries: @@ -275,7 +275,7 @@ Installing symlink libblis.so.0 into /u/field/blis/lib/ Installing blis.h into /u/field/blis/include/blis/ ``` This results in your `PREFIX` directory looking like: -``` +```bash # Check the contents of 'PREFIX'. $ ls -l $HOME/blis drwxr-xr-x 3 field dept 4096 May 10 17:36 include @@ -296,14 +296,14 @@ lrwxrwxrwx 1 field dept 16 May 10 17:42 libblis.so.0 -> libblis.so.0.0.0 ## Cleaning out build products If you want to remove various build products, you can use one of the `make` targets already defined for you in the BLIS Makefile: -``` +```bash $ make clean Removing flattened header files from ./include/haswell. Removing object files from ./obj/haswell. Removing libraries from ./lib/haswell. ``` Executing the `clean` target will remove all binary object files and library builds from the `obj` and `lib` directories, as well as any flattened header files. Any other configurations' build products are left untouched. -``` +```bash $ make cleanmk Removing makefile fragments from ./config. Removing makefile fragments from ./frame. @@ -311,7 +311,7 @@ Removing makefile fragments from ./ref_kernels. Removing makefile fragments from ./kernels. 
``` The `cleanmk` target results in removal of all makefile fragments from the framework source tree. (Makefile fragments are named `.fragment.mk` and are generated at configure-time.) -``` +```bash $ make distclean Removing makefile fragments from ./config. Removing makefile fragments from ./frame. @@ -357,7 +357,7 @@ If the BLAS compatibility layer was enabled at configure-time (as it is by defau ### Disabling BLAS prototypes Some applications already `#include` a header that contains BLAS prototypes. This can cause problems if those applications also try to `#include` the BLIS header file, as shown above. Suppose for a moment that `otherstuff.h` in the example above already provides BLAS prototypes. -``` +```bash $ gcc -I/path/to/blis -I/path/to/otherstuff -c main.c -o main.o In file included from main.c:41:0: /path/to/blis/blis.h:36900:111: error: conflicting declaration of C function ‘int xerbla_(const bla_character*, const bla_integer*, ftnlen)’ @@ -413,7 +413,7 @@ The makefile shown above a very simple example. If you need help linking your ap ## Uninstalling If you decide that you want to uninstall BLIS, simply run `make uninstall` -``` +```bash $ make uninstall Uninstalling libraries libblis.a libblis.so.0.0.0 from /u/field/blis/lib/. Uninstalling symlinks libblis.so libblis.so.0 from /u/field/blis/lib/. From 74c0c622216aba0c24aa2c3a923811366a160cf5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 16 Nov 2021 16:06:33 -0600 Subject: [PATCH 006/230] Reverted cbc88fe. Details: - Reverted the annotation of some markdown code blocks with 'bash' after realizing that the in-browser syntax highlighting was not worthwhile. --- README.md | 12 ++++++------ docs/BuildSystem.md | 40 ++++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 21bfe10d3..2abe79400 100644 --- a/README.md +++ b/README.md @@ -337,7 +337,7 @@ slightly out of date.) 
URL by clicking on the green button above the file/directory listing near the top of this page (as rendered by GitHub). Generally speaking, it will amount to executing the following command in your terminal shell: - ```bash + ``` git clone https://github.com/flame/blis.git ``` @@ -375,18 +375,18 @@ as discussed in [the previous section](#how-to-download-blis).* If you just want to build a sequential (not parallelized) version of BLIS in a hurry and come back and explore other topics later, you can configure and build BLIS as follows: -```bash +``` $ ./configure auto $ make [-j] ``` You can then verify your build by running BLAS- and BLIS-specific test drivers via `make check`: -```bash +``` $ make check [-j] ``` And if you would like to install BLIS to the directory specified to `configure` via the `--prefix` option, run the `install` target: -```bash +``` $ make install ``` Please read the output of `./configure --help` for a full list of configure-time @@ -417,11 +417,11 @@ built (but not necessarily installed) BLIS two directories up, in `../..`. If you have already installed BLIS to some permanent directory, you may refer to that installation by setting the environment variable `BLIS_INSTALL_PATH` prior to running make: -```bash +``` export BLIS_INSTALL_PATH=/usr/local; make ``` or by setting the same variable as part of the make command: -```bash +``` make BLIS_INSTALL_PATH=/usr/local ``` **Once the executable files have been built, we recommend reading the code and diff --git a/docs/BuildSystem.md b/docs/BuildSystem.md index 60fd541d6..5e290d9bb 100644 --- a/docs/BuildSystem.md +++ b/docs/BuildSystem.md @@ -40,14 +40,14 @@ Finally, we also require various other shell utilities that are so ubiquitous th Before starting, you must obtain a copy of BLIS. 
If you are an end-user (i.e., not a developer), you can download a tarball or zip file of the latest tagged version by returning to the main [BLIS homepage](https://github.com/flame/blis) and clicking on the [releases](https://github.com/flame/blis/releases) link. **However**, we highly recommend that you instead clone a copy using the command: -```bash +``` $ git clone https://github.com/flame/blis.git ``` Cloning a repository allows users and developers alike to quickly and easily pull in new commits as they are available, including commits that occur **between** tagged releases. Once you download the BLIS distribution, the top-level directory should look something like: -```bash +``` $ ls CHANGELOG Makefile common.mk configure mpi_test testsuite CREDITS README.md config frame obj version @@ -63,7 +63,7 @@ The first step is to choose how to configure BLIS. Specifically, a user must dec Configurations are described in detail in the [Configuration Guide](ConfigurationHowTo.md). Generally speaking, a configuration consists of several files that reside in a sub-directory of the `config` directory. To see a list of the available configurations, you may inspect this directory, or run `configure` with no arguments. Here are the current (as of this writing) contents of the `config` directory: -```bash +``` $ ls config amd64 cortexa15 excavator intel64 old power7 template bgq cortexa57 generic knc penryn sandybridge zen @@ -85,19 +85,19 @@ Multithreading in BLIS is disabled by default. For more information on enabling ## Step 2: Running `configure` This step should be somewhat familiar to many people who use open source software. To configure the build system, simply run: -```bash +``` $ ./configure ``` where `` is the configuration sub-directory name you chose in [Step 1](BuildSystem.md#step-1-choose-a-framework-configuration) above. 
If `` is not given, a helpful message is printed reminding you to explicit specify a configuration name along with a list of valid configuration families and their implied sub-configurations. For more information on sub-configurations and families, please see the BLIS [Configuration Guide](ConfigurationHowTo.md). Alternatively, `configure` can automatically select a configuration based on your hardware: -```bash +``` $ ./configure auto ``` However, as of this writing, BLIS lacks support for automatically detecting some architectures. If the `configure` script is not able to detect your architecture, the `generic` configuration will be used. Upon running configure, you will get output similar to the following. The exact output will depend on whether you cloned BLIS from a `git` repository or whether you obtained BLIS via a downloadable tarball from the [releases](https://github.com/flame/blis/releases) page. -```bash +``` $ ./configure --prefix=$HOME/blis haswell configure: using 'gcc' compiler. configure: found gcc version 5.4.0 (maj: 5, min: 4, rev: 0). @@ -174,17 +174,17 @@ configure: creating makefile fragments in ./frame configure: configured to build within top-level directory of source distribution. ``` The installation prefix can be specified via the `--prefix=PREFIX` option: -```bash +``` $ ./configure --prefix=/usr ``` This will cause libraries to eventually be installed (via `make install`) to `PREFIX/lib` and development headers to be installed to `PREFIX/include`. (The default value of `PREFIX` is `/usr/local`.) You can also specify the library install directory separately from the development header install directory with the `--libdir=LIBDIR` and `--includedir=INCDIR` options, respectively: -```bash +``` $ ./configure --libdir=/usr/lib --includedir=/usr/include ``` The `--libdir=LIBDIR` and `--includedir=INCDIR` options will override any path implied by `PREFIX`, whether it was specified explicitly via `--prefix` or implicitly (via the default). 
That is, `LIBDIR` defaults to `EXECPREFIX/lib` (where `EXECPREFIX`, set via `--exec-prefix=EXECPREFIX`, defaults to `PREFIX`) and `INCDIR` defaults to `PREFIX/include`, but `LIBDIR` and `INCDIR` will each be overriden by their respective `--libdir`/`--includedir` options. There is a third related option, `--sharedir=SHAREDIR`, where `SHAREDIR` defaults to `PREFIX/share`. This option specifies the installation directory for certain makefile fragments that contain variables determined by `configure` (e.g. `CC`, `CFLAGS`, `LDFLAGS`, etc.). These files allow certain BLIS makefiles, such as those in the `examples` or `testsuite` directories, to operate on an installed copy of BLIS rather than a local (and possibly uninstalled) copy. For a complete list of supported `configure` options and arguments, run `configure` with the `-h` option: -```bash +``` $ ./configure -h ``` The output from this invocation of `configure` should give you an up-to-date list of options and their descriptions. @@ -192,7 +192,7 @@ The output from this invocation of `configure` should give you an up-to-date lis ## Step 3: Compilation Once `configure` is finished, you are ready to instantiate (compile) BLIS into a library by running `make`. Running `make` will result in output similar to: -```bash +``` $ make Generating monolithic blis.h......................................................... ..................................................................................... @@ -209,11 +209,11 @@ Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int.o ('haswell' CFLAGS for ker Compiling obj/haswell/kernels/zen/1/bli_dotv_zen_int10.o ('haswell' CFLAGS for kernels) ``` If you want to see the individual command line invocations of the compiler, you can run `make` as follows: -```bash +``` $ make V=1 ``` Also, if you are compiling on a multicore system, you can get parallelism via: -```bash +``` $ make -j ``` where `` is the number of jobs `make` is allowed to run simultaneously. 
Generally, you should limit `` to p+1, where p is the number of processor cores on your system. @@ -236,7 +236,7 @@ The archiver and/or linker should no longer choke when creating the libraries. ## Step 3b: Testing (optional) If you would like to run some ready-made tests that exercise BLIS in a number of ways, including through its BLAS compatibility layer, run `make check`: -```bash +``` $ make check ``` Watch the output near the end. You should see the following messages, though not necessarily in immediate succession: @@ -263,7 +263,7 @@ Archiving lib/haswell/libblis.a Dynamically linking lib/haswell/libblis.so ``` Now you have a BLIS library (in static and shared forms) residing in the `lib//` directory. To install the libraries and the header files associated with it, simply execute: -```bash +``` $ make install ``` This installs copies of the libraries and header files, and also creates conventional symbolic links of shared libraries: @@ -275,7 +275,7 @@ Installing symlink libblis.so.0 into /u/field/blis/lib/ Installing blis.h into /u/field/blis/include/blis/ ``` This results in your `PREFIX` directory looking like: -```bash +``` # Check the contents of 'PREFIX'. $ ls -l $HOME/blis drwxr-xr-x 3 field dept 4096 May 10 17:36 include @@ -296,14 +296,14 @@ lrwxrwxrwx 1 field dept 16 May 10 17:42 libblis.so.0 -> libblis.so.0.0.0 ## Cleaning out build products If you want to remove various build products, you can use one of the `make` targets already defined for you in the BLIS Makefile: -```bash +``` $ make clean Removing flattened header files from ./include/haswell. Removing object files from ./obj/haswell. Removing libraries from ./lib/haswell. ``` Executing the `clean` target will remove all binary object files and library builds from the `obj` and `lib` directories, as well as any flattened header files. Any other configurations' build products are left untouched. -```bash +``` $ make cleanmk Removing makefile fragments from ./config. 
Removing makefile fragments from ./frame. @@ -311,7 +311,7 @@ Removing makefile fragments from ./ref_kernels. Removing makefile fragments from ./kernels. ``` The `cleanmk` target results in removal of all makefile fragments from the framework source tree. (Makefile fragments are named `.fragment.mk` and are generated at configure-time.) -```bash +``` $ make distclean Removing makefile fragments from ./config. Removing makefile fragments from ./frame. @@ -357,7 +357,7 @@ If the BLAS compatibility layer was enabled at configure-time (as it is by defau ### Disabling BLAS prototypes Some applications already `#include` a header that contains BLAS prototypes. This can cause problems if those applications also try to `#include` the BLIS header file, as shown above. Suppose for a moment that `otherstuff.h` in the example above already provides BLAS prototypes. -```bash +``` $ gcc -I/path/to/blis -I/path/to/otherstuff -c main.c -o main.o In file included from main.c:41:0: /path/to/blis/blis.h:36900:111: error: conflicting declaration of C function ‘int xerbla_(const bla_character*, const bla_integer*, ftnlen)’ @@ -413,7 +413,7 @@ The makefile shown above a very simple example. If you need help linking your ap ## Uninstalling If you decide that you want to uninstall BLIS, simply run `make uninstall` -```bash +``` $ make uninstall Uninstalling libraries libblis.a libblis.so.0.0.0 from /u/field/blis/lib/. Uninstalling symlinks libblis.so libblis.so.0 from /u/field/blis/lib/. From 26e4b6b29312b472c3cadf95ccdf5240764777f4 Mon Sep 17 00:00:00 2001 From: Dipal M Zambare <71366780+dzambare@users.noreply.github.com> Date: Thu, 18 Nov 2021 00:32:00 +0530 Subject: [PATCH 007/230] Added support for AMD's Zen3 microarchitecture. Details: - Added a new 'zen3' subconfiguration targeting support for the AMD Zen3 microarchitecture (#561). Thanks to AMD for this contribution. - Restructured clang and AOCC support for zen, zen2, and zen3 make_defs.mk files. 
The clang and AOCC version detection now happens in configure, not in the subconfigurations' makefile fragments. That is, we've added logic to configure that detects the version of clang/AOCC, outputs an appropriate variable to config.mk (i.e., CLANG_OT_*, AOCC_OT_*), and then checks for it within the makefile fragment (as is currently done for the GCC_OT_* variables). - Added configure support for a GCC_OT_10_1_0 variable (and associated substitution anchor) to communicate whether the gcc version is older than 10.1.0, and use this variable to check for recent enough versions of gcc to use -march=znver3 in the zen3 subconfig. - Inlined the contents of config/zen/amd_config.mk into the zen and zen2 make_defs.mk so that the files are self-contained, harmonizing the format of all three Zen-based subconfigurations' make_defs.mk files. - Added indenting (with spaces) of GNU make conditionals for easier reading in zen, zen2, and zen3 make_defs.mk files. - Adjusted the range of models checked by bli_cpuid_is_zen() (which was previously 0x00 ~ 0xff and is now 0x00 ~ 0x2f) so that it is completely disjoint from the models checked by bli_cpuid_is_zen2() (0x30 ~ 0xff). This is normally necessary because Zen and Zen2 microarchitectures share the same family (23, or 0x17), and so the model code is the only way to differentiate the two. But in our case, fixing the model range for zen *wasn't* actually necessary since we checked for zen2 first, and therefore the wide zen range acted like the 'else' of an 'if-else' statement. That said, the change helps improve clarity for the reader by encoding useful knowledge, which was obtained from https://en.wikichip.org/wiki/amd/cpuid . - Added zen2.def and zen3.def files to the collection in travis/cpuid.
Note that support for zen, zen2, and zen3 is now present, and while all three microarchitectures have identical instruction sets from the perspective of BLIS microkernels, they each correspond to different subconfigurations and therefore merit separate testing. Thanks to Devin Matthews for his guidance in hacking these files as slight modifications of zen.def. - Enabled testing of zen2 and zen3 via the SDE in travis/do_sde.sh. Now, zen, zen2, and zen3 are tested through the SDE via Travis CI builds. - Updated travis/do_sde.sh to grab the SDE tarball from a new ci-utils repository on GitHub rather than from Intel's website. This change was made in an attempt to circumvent recent troubles with Travis CI not being able to download the SDE directly from Intel's website via curl. Thanks to Devin Matthews for suggesting the idea. - Updated travis/do_sde.sh to grab the latest version (8.69.1) of the Intel SDE from the flame/ci-utils repository. - Updated .travis.yml to use gcc 9. The file was previously using gcc 8, which did not support -march=znver2. - Created amd64_legacy umbrella family in config_registry for targeting older (bulldozer, piledriver, steamroller, and excavator) microarchitectures and moved those same subconfigs out of the amd64 umbrella family. However, x86_64 retains amd64_legacy as a constituent member. - Fixed a bug in configure related to the building of the so-called config list. When processing the contents of config_registry, configure creates a series of structures and lists that allow for various mappings related to configuration families, subconfigs, and kernel sets. Two of those lists are built via substitution of umbrella families with their subconfig members, and one of those lists was improperly performing the substitution in a way that would erroneously match on partial umbrella family names. That code was changed to match the code that was already doing the substitution properly, via substitute_words().
Also added comments noting the importance of using substitute_words() in both instances. - Comment updates. --- .travis.yml | 8 +- build/config.mk.in | 5 + config/amd64/bli_family_amd64.h | 19 +- config/amd64/make_defs.mk | 29 +- config/amd64_legacy/bli_family_amd64_legacy.h | 42 +++ config/amd64_legacy/make_defs.mk | 70 ++++ config/zen/make_defs.mk | 67 ++-- config/zen/make_defs.mk.old | 84 +++++ config/zen2/make_defs.mk | 81 +++-- config/zen2/make_defs.mk.old | 94 ++++++ config/zen3/bli_cntx_init_zen3.c | 298 ++++++++++++++++++ config/zen3/bli_family_zen3.h | 94 ++++++ config/zen3/make_defs.mk | 113 +++++++ config/zen3/make_defs.mk.old | 137 ++++++++ config_registry | 12 +- configure | 128 +++++++- frame/base/bli_arch.c | 8 +- frame/base/bli_cpuid.c | 52 ++- frame/base/bli_cpuid.h | 3 +- frame/base/bli_error.h | 1 + frame/base/bli_gks.c | 7 +- frame/include/bli_arch_config.h | 12 + frame/include/bli_type_defs.h | 1 + kernels/zen3/.gitignore | 4 + travis/cpuid/zen2.def | 87 +++++ travis/cpuid/zen3.def | 87 +++++ travis/do_sde.sh | 4 +- 27 files changed, 1423 insertions(+), 124 deletions(-) create mode 100644 config/amd64_legacy/bli_family_amd64_legacy.h create mode 100644 config/amd64_legacy/make_defs.mk create mode 100644 config/zen/make_defs.mk.old create mode 100644 config/zen2/make_defs.mk.old create mode 100644 config/zen3/bli_cntx_init_zen3.c create mode 100644 config/zen3/bli_family_zen3.h create mode 100644 config/zen3/make_defs.mk create mode 100644 config/zen3/make_defs.mk.old create mode 100644 kernels/zen3/.gitignore create mode 100644 travis/cpuid/zen2.def create mode 100644 travis/cpuid/zen3.def diff --git a/.travis.yml b/.travis.yml index 555e9a11a..6603ca2f3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,17 +12,17 @@ matrix: - os: linux compiler: gcc env: OOT=1 TEST=ALL SDE=1 THR="none" CONF="x86_64" \ - PACKAGES="gcc-8 binutils" + PACKAGES="gcc-9 binutils" # openmp build - os: linux compiler: gcc env: OOT=0 TEST=FAST SDE=0 THR="openmp" 
CONF="auto" \ - PACKAGES="gcc-8 binutils" + PACKAGES="gcc-9 binutils" # pthreads build - os: linux compiler: gcc env: OOT=0 TEST=FAST SDE=0 THR="pthreads" CONF="auto" \ - PACKAGES="gcc-8 binutils" + PACKAGES="gcc-9 binutils" # clang build - os: linux compiler: clang @@ -63,7 +63,7 @@ matrix: PACKAGES="gcc-10-aarch64-linux-gnu g++-10-aarch64-linux-gnu libc6-dev-arm64-cross qemu-system-arm qemu-user" \ TESTSUITE_WRAPPER="qemu-aarch64 -cpu max,sve=true,sve512=true -L /usr/aarch64-linux-gnu/" install: -- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-8"; fi +- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi script: - export DIST_PATH=. diff --git a/build/config.mk.in b/build/config.mk.in index 7533d1acb..1032ce8e7 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -93,6 +93,11 @@ CC := @CC@ GCC_OT_4_9_0 := @gcc_older_than_4_9_0@ GCC_OT_6_1_0 := @gcc_older_than_6_1_0@ GCC_OT_9_1_0 := @gcc_older_than_9_1_0@ +GCC_OT_10_1_0 := @gcc_older_than_10_1_0@ +CLANG_OT_9_0_0 := @clang_older_than_9_0_0@ +CLANG_OT_12_0_0 := @clang_older_than_12_0_0@ +AOCC_OT_2_0_0 := @aocc_older_than_2_0_0@ +AOCC_OT_3_0_0 := @aocc_older_than_3_0_0@ # The C++ compiler. NOTE: A C++ is typically not needed. CXX := @CXX@ diff --git a/config/amd64/bli_family_amd64.h b/config/amd64/bli_family_amd64.h index 278c22818..ac10789aa 100644 --- a/config/amd64/bli_family_amd64.h +++ b/config/amd64/bli_family_amd64.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,15 +32,14 @@ */ -//#ifndef BLIS_FAMILY_H -//#define BLIS_FAMILY_H +#ifndef BLIS_FAMILY_AMD64_H +#define BLIS_FAMILY_AMD64_H +// Enable framework optimizations for EPYC family processors. +// With this macro defined, we can call kernels directly from +// BLAS interfaces for levels 1 & 2. +// This macro needs to be defined for all EPYC configurations. +#define BLIS_CONFIG_EPYC -// -- MEMORY ALLOCATION -------------------------------------------------------- - -#define BLIS_SIMD_ALIGN_SIZE 16 - - - -//#endif +#endif diff --git a/config/amd64/make_defs.mk b/config/amd64/make_defs.mk index b9232ac6c..ebb7a569f 100644 --- a/config/amd64/make_defs.mk +++ b/config/amd64/make_defs.mk @@ -1,10 +1,10 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -60,29 +60,8 @@ else COPTFLAGS := -O2 endif -# Flags specific to optimized kernels. -CKOPTFLAGS := $(COPTFLAGS) -O3 -ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -else -ifeq ($(CC_VENDOR),clang) -CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -else -$(error gcc or clang are required for this configuration.) -endif -endif - -# Flags specific to reference kernels. 
-CROPTFLAGS := $(CKOPTFLAGS) -ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast -else -ifeq ($(CC_VENDOR),clang) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast -else -CRVECFLAGS := $(CKVECFLAGS) -endif -endif +# Setting for reference and optimized kernels are taken from individual +# subconfiguration makefile fragments in this family. # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/amd64_legacy/bli_family_amd64_legacy.h b/config/amd64_legacy/bli_family_amd64_legacy.h new file mode 100644 index 000000000..c4f84885f --- /dev/null +++ b/config/amd64_legacy/bli_family_amd64_legacy.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_FAMILY_AMD64_LEGACY_H +#define BLIS_FAMILY_AMD64_LEGACY_H + +// Placeholder for bundle configuration. + +#endif + diff --git a/config/amd64_legacy/make_defs.mk b/config/amd64_legacy/make_defs.mk new file mode 100644 index 000000000..37ccbdae2 --- /dev/null +++ b/config/amd64_legacy/make_defs.mk @@ -0,0 +1,70 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := amd64_legacy +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O2 +endif + +# Setting for reference and optimized kernels are taken from individual +# subconfiguration makefile fragments in this family. + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk index 8f975d5bc..8bdafd5ca 100644 --- a/config/zen/make_defs.mk +++ b/config/zen/make_defs.mk @@ -1,11 +1,10 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
# -# Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2019, Advanced Micro Devices, Inc. +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -33,9 +32,6 @@ # # -# FLAGS that are specific to the 'zen' architecture are added here. -# FLAGS that are common for all the AMD architectures are present in -# amd_config.mk. # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. @@ -46,37 +42,50 @@ THIS_CONFIG := zen # --- Determine the C compiler and related flags --- # -# Include the file containing common flags for all AMD architectures. -AMD_CONFIG_FILE := amd_config.mk -AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen --include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := -ifeq ($(CC_VENDOR),gcc) -# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the -# Bulldozer instruction sets that were omitted from Zen. -# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add -# Zen-specific instructions back into the mix: -# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt -ifeq ($(GCC_OT_6_1_0),yes) -CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp -CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 else -# If gcc is at least 6.1.0, then we can specify the microarchitecture using -# the preferred option. 
-CRVECFLAGS += -march=znver1 -CKVECFLAGS += -march=znver1 +COPTFLAGS := -O2 -fomit-frame-pointer endif + +# Flags specific to optimized and reference kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +CKOPTFLAGS := $(COPTFLAGS) -O3 +CROPTFLAGS := $(CKOPTFLAGS) +CKVECFLAGS := -mavx2 -mfma -mfpmath=sse +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_6_1_0),yes) # gcc versions older than 6.1. + CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + else + CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store + endif else ifeq ($(CC_VENDOR),clang) -# I couldn't find which versions of clang added support for -march=znver1, -# so we don't even bother attempting the differentiation that appears in the -# gcc branch above. -CRVECFLAGS += -march=znver1 -CKVECFLAGS += -march=znver1 + CVECFLAGS_VER := -march=znver1 +else +ifeq ($(CC_VENDOR),aocc) + CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp else -$(error gcc or clang are required for this configuration.) + $(error gcc, clang, or aocc is required for this configuration.) +endif endif endif +CKVECFLAGS += $(CVECFLAGS_VER) +CRVECFLAGS += $(CVECFLAGS_VER) # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/zen/make_defs.mk.old b/config/zen/make_defs.mk.old new file mode 100644 index 000000000..44c2ad18d --- /dev/null +++ b/config/zen/make_defs.mk.old @@ -0,0 +1,84 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2019, Advanced Micro Devices, Inc. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# FLAGS that are specific to the 'zen' architecture are added here. +# FLAGS that are common for all the AMD architectures are present in +# amd_config.mk. + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := zen +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# Include the file containing common flags for all AMD architectures. 
+AMD_CONFIG_FILE := amd_config.mk +AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen +-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) + +ifeq ($(CC_VENDOR),gcc) +# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the +# Bulldozer instruction sets that were omitted from Zen. +# Additionally, if gcc is 4.9 (clang 3.5?) or newer, we may want to add +# Zen-specific instructions back into the mix: +# -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt +ifeq ($(GCC_OT_6_1_0),yes) +CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp +else +# If gcc is at least 6.1.0, then we can specify the microarchitecture using +# the preferred option. +CRVECFLAGS += -march=znver1 +CKVECFLAGS += -march=znver1 +endif +else +ifeq ($(CC_VENDOR),clang) +# I couldn't find which versions of clang added support for -march=znver1, +# so we don't even bother attempting the differentiation that appears in the +# gcc branch above. +CRVECFLAGS += -march=znver1 +CKVECFLAGS += -march=znver1 +else +$(error gcc or clang are required for this configuration.) +endif +endif + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk index 7d3ccb4bf..c14b8cba0 100644 --- a/config/zen2/make_defs.mk +++ b/config/zen2/make_defs.mk @@ -1,11 +1,10 @@ # # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # -# Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2019, Advanced Micro Devices, Inc. +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -33,9 +32,6 @@ # # -# FLAGS that are specific to the 'zen2' architecture are added here. -# FLAGS that are common for all the AMD architectures are present in -# config/zen/amd_config.mk. # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. @@ -46,41 +42,62 @@ THIS_CONFIG := zen2 # --- Determine the C compiler and related flags --- # -# Include file containing common flags for all AMD architectures. -AMD_CONFIG_FILE := amd_config.mk -AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen --include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := -ifeq ($(CC_VENDOR),gcc) -ifeq ($(GCC_OT_9_1_0),yes) -ifeq ($(GCC_OT_6_1_0),yes) -# If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the -# Bulldozer instruction sets that were omitted from Zen. -CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp -CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp -else -# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 -# as the fallback option. -CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store -CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 else -# If gcc is at least 9.1.0, then we can specify the microarchitecture using -# the preferred option. -CRVECFLAGS += -march=znver2 -CKVECFLAGS += -march=znver2 +COPTFLAGS := -O2 -fomit-frame-pointer endif + +# Flags specific to optimized and reference kernels. 
+# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +CKOPTFLAGS := $(COPTFLAGS) -O3 +CROPTFLAGS := $(CKOPTFLAGS) +CKVECFLAGS := -mavx2 -mfma -mfpmath=sse +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_6_1_0),yes) # gcc versions older than 6.1. + CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + else + ifeq ($(GCC_OT_9_1_0),yes) # gcc versions 6.1 or newer, but older than 9.1. + CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store + else # gcc versions 9.1 or newer. + CVECFLAGS_VER := -march=znver2 + endif + endif else ifeq ($(CC_VENDOR),clang) -# I couldn't find which versions of clang added support for -march=znver1 -# or -march=znver2, so we don't even bother attempting the differentiation -# that appears in the gcc branch above. -CRVECFLAGS += -march=znver1 -CKVECFLAGS += -march=znver1 + ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0. + CVECFLAGS_VER := -march=znver1 + else # clang versions 9.0 or newer. + CVECFLAGS_VER := -march=znver2 + endif +else +ifeq ($(CC_VENDOR),aocc) + ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0. + CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp + else # aocc versions 2.0 or newer. + CVECFLAGS_VER := -march=znver2 + endif else -$(error gcc or clang are required for this configuration.) + $(error gcc, clang, or aocc is required for this configuration.) +endif endif endif +CKVECFLAGS += $(CVECFLAGS_VER) +CRVECFLAGS += $(CVECFLAGS_VER) # Store all of the variables here to new variables containing the # configuration name. diff --git a/config/zen2/make_defs.mk.old b/config/zen2/make_defs.mk.old new file mode 100644 index 000000000..9f0370376 --- /dev/null +++ b/config/zen2/make_defs.mk.old @@ -0,0 +1,94 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. 
+# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2019, Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# FLAGS that are specific to the 'zen2' architecture are added here. +# FLAGS that are common for all the AMD architectures are present in +# config/zen/amd_config.mk. + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. 
+THIS_CONFIG := zen2 +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# Include file containing common flags for all AMD architectures. +AMD_CONFIG_FILE := amd_config.mk +AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen +-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE) + +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_9_1_0),yes) + ifeq ($(GCC_OT_6_1_0),yes) + # If gcc is older than 6.1.0, we must use -march=bdver4 and then remove the + # Bulldozer instruction sets that were omitted from Zen. + CRVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + CKVECFLAGS += -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp + else + # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 + # as the fallback option. + CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store + endif + else + # If gcc is at least 9.1.0, then we can specify the microarchitecture using + # the preferred option. + CRVECFLAGS += -march=znver2 + CKVECFLAGS += -march=znver2 + endif + else + ifeq ($(CC_VENDOR),clang) + ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) + CKVECFLAGS += -march=znver2 + else + #if compiling with clang + VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) + CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1)) + #clang 9.0 or later: + ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) + CKVECFLAGS += -march=znver2 + else + CKVECFLAGS += -march=znver1 + endif # ge 9 + endif # AOCC 2 + endif # Clang +endif # gcc + +# Store all of the variables here to new variables containing the +# configuration name. 
+$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c new file mode 100644 index 000000000..b5bbb05ed --- /dev/null +++ b/config/zen3/bli_cntx_init_zen3.c @@ -0,0 +1,298 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_cntx_init_zen3( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t thresh[ BLIS_NUM_THRESH ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_zen3_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 8, + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx + ); + +#if 0 + // AMD: This will be enabled in other PRs. + // packm kernels + bli_cntx_set_packm_kers + ( + 2, + BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, + BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, + cntx + ); +#else + // Update the context with optimized packm kernels. + bli_cntx_set_packm_kers + ( + 8, + BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, + cntx + ); +#endif + + // Update the context with optimized level-1f kernels. 
+ bli_cntx_set_l1f_kers + ( + 4, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx + ); + + // Update the context with optimized level-1v kernels. + bli_cntx_set_l1v_kers + ( + 16, + + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + + // axpyv + + // axpyv + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, + + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + + // scalv + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, + + //swap + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + //copy + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + + //set + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // + // These are reference block sizes and may be overridden based on + // number of threads used at runtime. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 7, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + cntx + ); + +// ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 ); + + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); + +#if 0 + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + 2, + BLIS_GEMM, bli_gemmsup_ref, + BLIS_GEMMT, bli_gemmtsup_ref, + cntx + ); +#endif + +#if 0 + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. 
+ bli_cntx_set_l3_sup_kers + ( + 28, + //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + cntx + ); +#else + // Update the context with optimized small/unpacked gemm kernels. 
+ bli_cntx_set_l3_sup_kers + ( + 16, + //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, + cntx + ); + +#endif + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. 
+ bli_cntx_set_l3_sup_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); +} + diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h new file mode 100644 index 000000000..918e919ae --- /dev/null +++ b/config/zen3/bli_family_zen3.h @@ -0,0 +1,94 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLI_FAMILY_ZEN3_ +#define BLI_FAMILY_ZEN3_ + +// By default, it is effective to parallelize the outer loops. +// Setting these macros to 1 will force JR and IR inner loops +// not to be parallelized. +// + +#define BLIS_THREAD_MAX_IR 1 +#define BLIS_THREAD_MAX_JR 1 + + +// To enable framework optimizations for zen3 platform +// All zen3 specific code should be included in this macro +#define BLIS_CONFIG_ZEN3 + +// To enable framework optimizations for zen3 platform +// All zen3 specific code should be included in this macro +#define BLIS_CONFIG_ZEN3 + +#define BLIS_ENABLE_SMALL_MATRIX +#define BLIS_ENABLE_SMALL_MATRIX_TRSM + + +// This will select the threshold below which small matrix code will be called. 
+#define BLIS_SMALL_MATRIX_THRES 700 +#define BLIS_SMALL_M_RECT_MATRIX_THRES 160 +#define BLIS_SMALL_K_RECT_MATRIX_THRES 128 + +#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) +#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 + +#define BLIS_ENABLE_SMALL_MATRIX_ROME +#define BLIS_SMALL_MATRIX_THRES_ROME 400 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 + +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 +#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 + +#endif diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk new file mode 100644 index 000000000..5c68855db --- /dev/null +++ b/config/zen3/make_defs.mk @@ -0,0 +1,113 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := zen3 +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. 
+CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -O3 +endif + +# Flags specific to optimized and reference kernels. +# NOTE: The -fomit-frame-pointer option is needed for some kernels because +# they make explicit use of the rbp register. +CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer +CROPTFLAGS := $(CKOPTFLAGS) +CKVECFLAGS := -mavx2 -mfma -mfpmath=sse +CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast +ifeq ($(CC_VENDOR),gcc) + ifeq ($(GCC_OT_9_1_0),yes) # gcc versions older than 9.1. + CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store + else + ifeq ($(GCC_OT_10_1_0),yes) # gcc versions 9.1 or newer, but older than 10.1. + CVECFLAGS_VER := -march=znver2 + else # gcc versions 10.1 or newer. + CVECFLAGS_VER := -march=znver3 + endif + endif +else +ifeq ($(CC_VENDOR),clang) + ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0. + CVECFLAGS_VER := -march=znver1 + else + ifeq ($(CLANG_OT_12_0_0),yes) # clang versions 9.0 or newer, but older than 12.0. + CVECFLAGS_VER := -march=znver2 + else # clang versions 12.0 or newer. + CVECFLAGS_VER := -march=znver3 + endif + endif +else +ifeq ($(CC_VENDOR),aocc) + ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0. + CVECFLAGS_VER := -march=znver1 + else + ifeq ($(AOCC_OT_3_0_0),yes) # aocc versions 2.0 or newer, but older than 3.0. + CVECFLAGS_VER := -march=znver2 + else # aocc versions 3.0 or newer. + CVECFLAGS_VER := -march=znver3 + endif + endif +else + $(error gcc, clang, or aocc is required for this configuration.) +endif +endif +endif +CKVECFLAGS += $(CVECFLAGS_VER) +CRVECFLAGS += $(CVECFLAGS_VER) + +# Store all of the variables here to new variables containing the +# configuration name. 
+$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config/zen3/make_defs.mk.old b/config/zen3/make_defs.mk.old new file mode 100644 index 000000000..e0794ab0c --- /dev/null +++ b/config/zen3/make_defs.mk.old @@ -0,0 +1,137 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# FLAGS that are specific to the 'zen3' architecture are added here. 
+# FLAGS that are common for all the AMD architectures are present in +# config/zen/amd_config.mk. + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := zen3 +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CPPROCFLAGS := +CMISCFLAGS := +CPICFLAGS := +CWARNFLAGS := + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +#frame pointers are needed for execution tracing +ifeq ($(ETRACE_ENABLE),1) +COPTFLAGS := -O3 +else +COPTFLAGS := -O3 -fomit-frame-pointer +endif +endif + + +# +# --- Enable ETRACE across the library if enabled ETRACE_ENABLE=[0,1] ----------------------- +# + +ifeq ($(ETRACE_ENABLE),1) +CDBGFLAGS += -pg -finstrument-functions -DAOCL_DTL_AUTO_TRACE_ENABLE +LDFLAGS += -ldl +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) +ifeq ($(CC_VENDOR),gcc) +GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) +#gcc or clang version must be at least 4.0 +# gcc 9.0 or later: +ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) +CKVECFLAGS += -march=znver2 +else +# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 +# as the fallback option. 
+CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store +CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store +endif +else +ifeq ($(CC_VENDOR),clang) + +# AOCC clang has various formats for the version line + +# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) +# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) +# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) +# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) +# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + +# For our purpose we just want to know if it is version 2x or 3x + +# for version 3x we will enable znver3 +ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) +CKVECFLAGS += -march=znver3 +else +# for version 2x we will enable znver2 +ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) +CKVECFLAGS += -march=znver2 +else +#if compiling with clang +VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) +CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1)) +#clang 9.0 or later: +ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) +CKVECFLAGS += -march=znver2 +else +CKVECFLAGS += -march=znver1 +endif # ge 9 +endif # aocc 2 +endif # aocc 3 +endif # clang +endif # gcc + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +CRVECFLAGS := $(CKVECFLAGS) + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config_registry b/config_registry index bdd3d2228..d472325c7 100644 --- a/config_registry +++ b/config_registry @@ -8,11 +8,12 @@ # # Processor families. 
-x86_64: intel64 amd64 -intel64: skx knl haswell sandybridge penryn generic -amd64: zen2 zen excavator steamroller piledriver bulldozer generic -arm64: firestorm thunderx2 cortexa57 cortexa53 generic -arm32: cortexa15 cortexa9 generic +x86_64: intel64 amd64 amd64_legacy +intel64: skx knl haswell sandybridge penryn generic +amd64_legacy: excavator steamroller piledriver bulldozer generic +amd64: zen3 zen2 zen generic +arm64: firestorm thunderx2 cortexa57 cortexa53 generic +arm32: cortexa15 cortexa9 generic # Intel architectures. skx: skx/skx/haswell/zen @@ -22,6 +23,7 @@ sandybridge: sandybridge penryn: penryn # AMD architectures. +zen3: zen3/zen3/zen2/zen/haswell zen2: zen2/zen2/zen/haswell zen: zen/zen/haswell excavator: excavator/piledriver diff --git a/configure b/configure index 3c865dad9..447b0791e 100755 --- a/configure +++ b/configure @@ -1434,22 +1434,80 @@ get_compiler_version() # The last part ({ read first rest ; echo $first ; }) is a workaround # to OS X's egrep only returning the first match. cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG' | { read first rest ; echo $first ; }) + + # AOCC version strings contain both "clang" and "AOCC" substrings, and + # so we have to perform a follow-up check to make sure cc_vendor gets set + # correctly. + aocc_grep=$(echo "${vendor_string}" | grep 'AOCC') + if [ -n "${aocc_grep}" ]; then + cc_vendor="aocc" + fi + + # Begin parsing cc_vendor for the version string. + if [ "${cc_vendor}" = "crosstool-NG" ]; then # Treat compilers built by crosstool-NG (for eg: conda) as gcc. cc_vendor="gcc" fi if [ "${cc_vendor}" = "icc" -o \ "${cc_vendor}" = "gcc" ]; then + cc_version=$(${cc} -dumpversion) - # If compiler is AOCC, first grep for clang and then the version number. 
+ elif [ "${cc_vendor}" = "clang" ]; then - cc_version=$(echo "${vendor_string}" | egrep -o '(clang|LLVM) version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*') + + cc_version=$(echo "${vendor_string}" \ + | egrep -o '(clang|LLVM) version [0-9]+\.[0-9]+\.?[0-9]*' \ + | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*') + + elif [ "${cc_vendor}" = "aocc" ]; then + + aocc_ver21=$(echo "${vendor_string}" | grep 'AOCC.LLVM.2') + + # Versions 2.0 and 2.1 had different version string formats from + # 2.2 and later, so we have to handle them separately. + # Examples: + # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19) + # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12) + # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0) + # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0) + # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0) + + if [ -n "${aocc_ver21}" ]; then + + # Grep for the AOCC.LLVM.x.y.z substring first, and then isolate the + # version number. Also, the string may contain multiple instances of + # the version number, so only use the first occurrence. + cc_version=$(echo "${vendor_string}" \ + | egrep -o 'AOCC.LLVM.[0-9]+\.[0-9]+\.?[0-9]*' \ + | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' \ + | { read first rest ; echo $first ; }) + else + + # Grep for the AOCC_x.y.z substring first, and then isolate the + # version number. As of this writing, these version strings don't + # include multiple instances of the version, but we nonetheless + # take only the first occurrence as a future-oriented safety + # measure. 
+ cc_version=$(echo "${vendor_string}" \ + | egrep -o 'AOCC_[0-9]+\.[0-9]+\.?[0-9]*' \ + | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' \ + | { read first rest ; echo $first ; }) + fi + elif [ "${cc_vendor}" = "oneAPI" ]; then + # Treat Intel oneAPI's clang as clang, not icc. cc_vendor="clang" - cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; }) + cc_version=$(echo "${vendor_string}" \ + | egrep -o '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' \ + | { read first rest ; echo ${first} ; }) + else - cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; }) + + cc_version=$(echo "${vendor_string}" \ + | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' \ + | { read first rest ; echo ${first} ; }) fi # Parse the version number into its major, minor, and revision @@ -1500,6 +1558,8 @@ check_compiler() # penryn: any # # zen: gcc 6.0+[1], clang 4.0+ + # zen2: gcc 6.0+[1], clang 4.0+ + # zen3: gcc 6.0+[1], clang 4.0+ # excavator: gcc 4.9+, clang 3.5+ # steamroller: any # piledriver: any @@ -1683,12 +1743,30 @@ check_compiler_version_ranges() # Newer versions of gcc support Zen2 via the '-march=znver2' option [6]. # # [5] https://gcc.gnu.org/onlinedocs/gcc-8.3.0/gcc/x86-Options.html#x86-Options - # [6] https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/x86-Options.html#x86-Options + # [6] https://gcc.gnu.org/onlinedocs/gcc-9.4.0/gcc/x86-Options.html#x86-Options + # + # range: gcc < 10.1 (ie: 9.4 or older) + # variable: gcc_older_than_10_1_0 + # comments: + # These older versions of gcc do not explicitly support the Zen3 + # microarchitecture; the newest microarchitectural value understood by + # these versions is '-march=znver2' (if !gcc_older_than_9_1_0) [7]. + # Newer versions of gcc support Zen3 via the '-march=znver3' option [8]. 
+ # + # [7] https://gcc.gnu.org/onlinedocs/gcc-9.4.0/gcc/x86-Options.html#x86-Options + # [8] https://gcc.gnu.org/onlinedocs/gcc-10.3.0/gcc/x86-Options.html#x86-Options # gcc_older_than_4_9_0='no' gcc_older_than_6_1_0='no' gcc_older_than_9_1_0='no' + gcc_older_than_10_1_0='no' + + clang_older_than_9_0_0='no' + clang_older_than_12_0_0='no' + + aocc_older_than_2_0_0='no' + aocc_older_than_3_0_0='no' echo "${script_name}: checking ${cc} ${cc_version} against known consequential version ranges." @@ -1714,6 +1792,12 @@ check_compiler_version_ranges() echo "${script_name}: note: found ${cc} version older than 9.1." gcc_older_than_9_1_0='yes' fi + + # Check for gcc < 10.1.0 (ie: 9.4 or older). + if [ ${cc_major} -lt 10 ]; then + echo "${script_name}: note: found ${cc} version older than 10.1." + gcc_older_than_10_1_0='yes' + fi fi # icc @@ -1723,7 +1807,34 @@ check_compiler_version_ranges() # clang if [ "x${cc_vendor}" = "xclang" ]; then - : + + # Check for clang < 9.0.0. + if [ ${cc_major} -lt 9 ]; then + echo "${script_name}: note: found ${cc} version older than 9.0." + clang_older_than_9_0_0='yes' + fi + + # Check for clang < 12.0.0. + if [ ${cc_major} -lt 12 ]; then + echo "${script_name}: note: found ${cc} version older than 12.0." + clang_older_than_12_0_0='yes' + fi + fi + + # aocc + if [ "x${cc_vendor}" = "xaocc" ]; then + + # Check for aocc < 2.0.0. + if [ ${cc_major} -lt 2 ]; then + echo "${script_name}: note: found ${cc} version older than 2.0." + aocc_older_than_2_0_0='yes' + fi + + # Check for aocc < 3.0.0. + if [ ${cc_major} -lt 3 ]; then + echo "${script_name}: note: found ${cc} version older than 3.0." 
+ aocc_older_than_3_0_0='yes' + fi fi } @@ -3353,6 +3464,11 @@ main() | sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \ | sed -e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g" \ | sed -e "s/@gcc_older_than_9_1_0@/${gcc_older_than_9_1_0}/g" \ + | sed -e "s/@gcc_older_than_10_1_0@/${gcc_older_than_10_1_0}/g" \ + | sed -e "s/@clang_older_than_9_0_0@/${clang_older_than_9_0_0}/g" \ + | sed -e "s/@clang_older_than_12_0_0@/${clang_older_than_12_0_0}/g" \ + | sed -e "s/@aocc_older_than_2_0_0@/${aocc_older_than_2_0_0}/g" \ + | sed -e "s/@aocc_older_than_3_0_0@/${aocc_older_than_3_0_0}/g" \ | sed -e "s/@CC@/${cc_esc}/g" \ | sed -e "s/@CXX@/${cxx_esc}/g" \ | sed -e "s/@RANLIB@/${ranlib_esc}/g" \ diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index c8d8eec79..54aa64d42 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2019, Advanced Micro Devices, Inc. + Copyright (C) 2018-2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -171,6 +171,9 @@ void bli_arch_set_id( void ) #endif // AMD microarchitectures. + #ifdef BLIS_FAMILY_ZEN3 + id = BLIS_ARCH_ZEN3; + #endif #ifdef BLIS_FAMILY_ZEN2 id = BLIS_ARCH_ZEN2; #endif @@ -259,6 +262,7 @@ static char* config_name[ BLIS_NUM_ARCHS ] = "sandybridge", "penryn", + "zen3", "zen2", "zen", "excavator", @@ -279,7 +283,7 @@ static char* config_name[ BLIS_NUM_ARCHS ] = "power9", "power7", "bgq", - + "generic" }; diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index c7ceb8d7c..ff0f386e6 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -5,7 +5,8 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2019, Advanced Micro Devices, Inc. + Copyright (C) 2018-2020, Advanced Micro Devices, Inc. 
+ Copyright (C) 2019, Dave Love, University of Manchester Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -131,6 +132,10 @@ arch_t bli_cpuid_query_id( void ) // Check for each AMD configuration that is enabled, check for that // microarchitecture. We check from most recent to most dated. +#ifdef BLIS_CONFIG_ZEN3 + if ( bli_cpuid_is_zen3( family, model, features ) ) + return BLIS_ARCH_ZEN3; +#endif #ifdef BLIS_CONFIG_ZEN2 if ( bli_cpuid_is_zen2( family, model, features ) ) return BLIS_ARCH_ZEN2; @@ -278,6 +283,35 @@ bool bli_cpuid_is_penryn // ----------------------------------------------------------------------------- +bool bli_cpuid_is_zen3 + ( + uint32_t family, + uint32_t model, + uint32_t features + ) +{ + // Check for expected CPU features. + const uint32_t expected = FEATURE_AVX | + FEATURE_FMA3 | + FEATURE_AVX2; + + if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; + + // All Zen3 cores have a family of 0x19. + if ( family != 0x19 ) return FALSE; + + // Finally, check for specific models: + // - 0x00 ~ 0xff + // NOTE: We accept any model because the family 25 (0x19) is unique. + const bool is_arch + = + ( 0x00 <= model && model <= 0xff ); + + if ( !is_arch ) return FALSE; + + return TRUE; +} + bool bli_cpuid_is_zen2 ( uint32_t family, @@ -296,7 +330,9 @@ bool bli_cpuid_is_zen2 if ( family != 0x17 ) return FALSE; // Finally, check for specific models: - // - 0x30-0xff (THIS NEEDS UPDATING) + // - 0x30 ~ 0xff + // NOTE: We must check model because the family 23 (0x17) is shared with + // zen. const bool is_arch = ( 0x30 <= model && model <= 0xff ); @@ -324,10 +360,12 @@ bool bli_cpuid_is_zen if ( family != 0x17 ) return FALSE; // Finally, check for specific models: - // - 0x00-0xff (THIS NEEDS UPDATING) + // - 0x00 ~ 0x2f + // NOTE: We must check model because the family 23 (0x17) is shared with + // zen2. 
const bool is_arch = - ( 0x00 <= model && model <= 0xff ); + ( 0x00 <= model && model <= 0x2f ); if ( !is_arch ) return FALSE; @@ -352,7 +390,7 @@ bool bli_cpuid_is_excavator if ( family != 0x15 ) return FALSE; // Finally, check for specific models: - // - 0x60-0x7f + // - 0x60 ~ 0x7f const bool is_arch = ( 0x60 <= model && model <= 0x7f ); @@ -380,7 +418,7 @@ bool bli_cpuid_is_steamroller if ( family != 0x15 ) return FALSE; // Finally, check for specific models: - // - 0x30-0x3f + // - 0x30 ~ 0x3f const bool is_arch = ( 0x30 <= model && model <= 0x3f ); @@ -409,7 +447,7 @@ bool bli_cpuid_is_piledriver // Finally, check for specific models: // - 0x02 - // - 0x10-0x1f + // - 0x10 ~ 0x1f const bool is_arch = model == 0x02 || ( 0x10 <= model && model <= 0x1f ); diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index d8e597aee..3fea78e5a 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2019, Advanced Micro Devices, Inc. + Copyright (C) 2018-2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -61,6 +61,7 @@ bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t feature bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD +bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); diff --git a/frame/base/bli_error.h b/frame/base/bli_error.h index 8c2971781..e6e6f35dd 100644 --- a/frame/base/bli_error.h +++ b/frame/base/bli_error.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 0a5bcafd4..cc17b33ff 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018-2019, Advanced Micro Devices, Inc. + Copyright (C) 2018-2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -97,6 +97,11 @@ void bli_gks_init( void ) #endif // AMD architectures +#ifdef BLIS_CONFIG_ZEN3 + bli_gks_register_cntx( BLIS_ARCH_ZEN3, bli_cntx_init_zen3, + bli_cntx_init_zen3_ref, + bli_cntx_init_zen3_ind ); +#endif #ifdef BLIS_CONFIG_ZEN2 bli_gks_register_cntx( BLIS_ARCH_ZEN2, bli_cntx_init_zen2, bli_cntx_init_zen2_ref, diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index b0d23419f..f804d3003 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -42,6 +42,7 @@ // // -- Intel64 architectures -- + #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif @@ -62,6 +63,10 @@ CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- + +#ifdef BLIS_CONFIG_ZEN3 +CNTX_INIT_PROTS( zen3 ) +#endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif @@ -145,11 +150,15 @@ CNTX_INIT_PROTS( generic ) #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" #endif +#ifdef BLIS_FAMILY_AMD64_LEGACY +#include "bli_family_amd64_legacy.h" +#endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" #endif // -- Intel64 architectures -- + #ifdef BLIS_FAMILY_SKX #include "bli_family_skx.h" #endif @@ -171,6 +180,9 @@ CNTX_INIT_PROTS( generic ) // -- AMD64 architectures -- +#ifdef BLIS_FAMILY_ZEN3 +#include 
"bli_family_zen3.h" +#endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c2db052e5..f1a7e8f8d 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -939,6 +939,7 @@ typedef enum BLIS_ARCH_PENRYN, // AMD + BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, diff --git a/kernels/zen3/.gitignore b/kernels/zen3/.gitignore new file mode 100644 index 000000000..5e7d2734c --- /dev/null +++ b/kernels/zen3/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/travis/cpuid/zen2.def b/travis/cpuid/zen2.def new file mode 100644 index 000000000..1e2cc6390 --- /dev/null +++ b/travis/cpuid/zen2.def @@ -0,0 +1,87 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: AMD EPYC 7742 +# NOTE: This file was copied from zen.def and then the appropriate bits +# in the first field (eax) of leaf 1 were updated to reflect the Zen2 +# "Rome" processor. See [1] for details. +# [1] https://en.wikichip.org/wiki/amd/cpuid +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00830F12 00400800 7ED8320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000011 +00000006 ******** => 00000004 00000000 00000001 00000000 +00000007 ******** => 00000000 209C01A9 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 00000340 00000000 +0000000D 00000001 => 0000000F 00000340 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +80000000 ******** => 8000001F 68747541 444D4163 69746E65 +80000001 ******** => 00800F12 40000000 35C233FF 2FD3FBFF +80000002 ******** => 20444D41 43595045 35353720 33205031 +80000003 ******** => 6F432D32 50206572 65636F72 726F7373 +80000004 ******** => 20202020 20202020 20202020 00202020 +80000005 ******** => FF40FF40 FF40FF40 20080140 40040140 +80000006 ******** => 36006400 56006400 
02006140 0200C140 +80000007 ******** => 00000000 0000001B 00000000 00006799 +80000008 ******** => 00003030 00000007 0000603F 00000000 +80000009 ******** => 00000000 00000000 00000000 00000000 +8000000A ******** => 00000001 00008000 00000000 0001BCFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 00000000 00000000 +80000013 ******** => 00000000 00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F040 00000000 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000003FF 00000000 00000000 00000000 +8000001C ******** => 00000000 00000000 00000000 00000000 +8000001D 00000000 => 00004121 01C0003F 0000003F 00000000 +8000001D 00000001 => 00004122 00C0003F 000000FF 00000000 +8000001D 00000002 => 00004143 01C0003F 000003FF 00000002 +8000001D 00000003 => 0001C163 03C0003F 00001FFF 00000001 +8000001E ******** => 00000000 00000100 00000300 00000000 +8000001F ******** => 0000000F 0000016F 0000000F 00000001 +8FFFFFFF ******** => 00000000 00000000 00000000 00000000 diff --git a/travis/cpuid/zen3.def b/travis/cpuid/zen3.def new file mode 100644 index 000000000..ed791813e --- /dev/null +++ b/travis/cpuid/zen3.def @@ -0,0 +1,87 @@ +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. 
+# +# Copyright (C) 2018, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CPU: AMD EPYC 7xxx +# NOTE: This file was copied from zen.def and then the appropriate bits +# in the first field (eax) of leaf 1 were updated to reflect the Zen3 +# "Milan" processor. See [1] for details. 
+# [1] https://en.wikichip.org/wiki/amd/cpuid +# +00000000 ******** => 0000000D 68747541 444D4163 69746E65 +00000001 ******** => 00A00F12 00400800 7ED8320B 178BFBFF +00000002 ******** => 00000000 00000000 00000000 00000000 +00000003 ******** => 00000000 00000000 00000000 00000000 +00000005 ******** => 00000040 00000040 00000003 00000011 +00000006 ******** => 00000004 00000000 00000001 00000000 +00000007 ******** => 00000000 209C01A9 00000000 00000000 +00000008 ******** => 00000000 00000000 00000000 00000000 +00000009 ******** => 00000000 00000000 00000000 00000000 +0000000A ******** => 00000000 00000000 00000000 00000000 +0000000C ******** => 00000000 00000000 00000000 00000000 +0000000D 00000000 => 00000007 00000340 00000340 00000000 +0000000D 00000001 => 0000000F 00000340 00000000 00000000 +0000000D 00000002 => 00000100 00000240 00000000 00000000 +80000000 ******** => 8000001F 68747541 444D4163 69746E65 +80000001 ******** => 00800F12 40000000 35C233FF 2FD3FBFF +80000002 ******** => 20444D41 43595045 35353720 33205031 +80000003 ******** => 6F432D32 50206572 65636F72 726F7373 +80000004 ******** => 20202020 20202020 20202020 00202020 +80000005 ******** => FF40FF40 FF40FF40 20080140 40040140 +80000006 ******** => 36006400 56006400 02006140 0200C140 +80000007 ******** => 00000000 0000001B 00000000 00006799 +80000008 ******** => 00003030 00000007 0000603F 00000000 +80000009 ******** => 00000000 00000000 00000000 00000000 +8000000A ******** => 00000001 00008000 00000000 0001BCFF +8000000B ******** => 00000000 00000000 00000000 00000000 +8000000C ******** => 00000000 00000000 00000000 00000000 +8000000D ******** => 00000000 00000000 00000000 00000000 +8000000E ******** => 00000000 00000000 00000000 00000000 +8000000F ******** => 00000000 00000000 00000000 00000000 +80000010 ******** => 00000000 00000000 00000000 00000000 +80000011 ******** => 00000000 00000000 00000000 00000000 +80000012 ******** => 00000000 00000000 00000000 00000000 +80000013 ******** => 00000000 
00000000 00000000 00000000 +80000014 ******** => 00000000 00000000 00000000 00000000 +80000015 ******** => 00000000 00000000 00000000 00000000 +80000016 ******** => 00000000 00000000 00000000 00000000 +80000017 ******** => 00000000 00000000 00000000 00000000 +80000018 ******** => 00000000 00000000 00000000 00000000 +80000019 ******** => F040F040 00000000 00000000 00000000 +8000001A ******** => 00000003 00000000 00000000 00000000 +8000001B ******** => 000003FF 00000000 00000000 00000000 +8000001C ******** => 00000000 00000000 00000000 00000000 +8000001D 00000000 => 00004121 01C0003F 0000003F 00000000 +8000001D 00000001 => 00004122 00C0003F 000000FF 00000000 +8000001D 00000002 => 00004143 01C0003F 000003FF 00000002 +8000001D 00000003 => 0001C163 03C0003F 00001FFF 00000001 +8000001E ******** => 00000000 00000100 00000300 00000000 +8000001F ******** => 0000000F 0000016F 0000000F 00000001 +8FFFFFFF ******** => 00000000 00000000 00000000 00000000 diff --git a/travis/do_sde.sh b/travis/do_sde.sh index c8eb5aa58..de1545886 100755 --- a/travis/do_sde.sh +++ b/travis/do_sde.sh @@ -3,7 +3,7 @@ set -e set -x -SDE_VERSION=sde-external-8.63.0-2021-01-18-lin +SDE_VERSION=sde-external-8.69.1-2021-07-18-lin SDE_TARBALL=$SDE_VERSION.tar.bz2 SDE=$SDE_VERSION/sde64 @@ -46,7 +46,7 @@ for LIB in $LD_SO $LIBC_SO $LIBM_SO; do done #for ARCH in penryn sandybridge haswell skx knl piledriver steamroller excavator zen; do -for ARCH in penryn sandybridge haswell skx knl zen; do +for ARCH in penryn sandybridge haswell skx knl zen zen2 zen3; do if [ "$ARCH" = "knl" ]; then $SDE -knl -- ./test_libblis.x > output.testsuite else From 9be97c150e19fa58bca30cb993a6509ae21e2025 Mon Sep 17 00:00:00 2001 From: Madan mohan Manokar <86282872+madanm3@users.noreply.github.com> Date: Thu, 18 Nov 2021 00:46:46 +0530 Subject: [PATCH 008/230] Support all four dts in test/test_her[2][k].c (#578) Details: - Replaced the hard-coded calls to double-precision real syr, syr2, syrk, and syrk in the corresponding 
standalone test drivers in the 'test' directory with conditional branches that will call the appropriate BLAS interface depending on which datatype is enabled. Thanks to Madan mohan Manokar for this improvement. - CREDITS file update. --- CREDITS | 1 + test/test_her.c | 93 ++++++++++++++++++++++++++++----------- test/test_her2.c | 110 ++++++++++++++++++++++++++++++++++------------ test/test_her2k.c | 90 +++++++++++++++++++------------------ test/test_herk.c | 75 +++++++++++++++---------------- 5 files changed, 230 insertions(+), 139 deletions(-) diff --git a/CREDITS b/CREDITS index df088c746..81fc9bec5 100644 --- a/CREDITS +++ b/CREDITS @@ -58,6 +58,7 @@ but many others have contributed code and feedback, including Tze Meng Low (The University of Texas at Austin) Ye Luo @ye-luo (Argonne National Laboratory) Ricardo Magana @magania (Hewlett Packard Enterprise) + Madan mohan Manokar @madanm3 (AMD) Giorgos Margaritis Bryan Marker @bamarker (The University of Texas at Austin) Simon Lukas Märtens @ACSimon33 (RWTH Aachen University) diff --git a/test/test_her.c b/test/test_her.c index 341b8a5fc..267e1bfe0 100644 --- a/test/test_her.c +++ b/test/test_her.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -81,11 +82,8 @@ int main( int argc, char** argv ) m_input = 6; #endif -#if 1 - dt_alpha = dt_x = dt_a = BLIS_DOUBLE; -#else + // her supports complex and double complex dt_alpha = dt_x = dt_a = BLIS_DCOMPLEX; -#endif uplo = BLIS_LOWER; @@ -127,7 +125,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) @@ -143,33 +141,76 @@ int main( int argc, char** argv ) #endif #ifdef BLIS - //bli_obj_toggle_conj( &x ); - //bli_syr( &alpha, bli_her( &alpha, &x, &a ); #else - - f77_char uplo = 'L'; - f77_int mm = bli_obj_length( &a ); - f77_int incx = bli_obj_vector_inc( &x ); - f77_int lda = bli_obj_col_stride( &a ); - double* alphap = bli_obj_buffer( &alpha ); - double* xp = bli_obj_buffer( &x ); - double* ap = bli_obj_buffer( &a ); -/* - dcomplex* xp = bli_obj_buffer( x ); - dcomplex* ap = bli_obj_buffer( &a ); -*/ - - dsyr_( &uplo, - //zher_( &uplo, - &mm, - alphap, - xp, &incx, - ap, &lda ); + if ( bli_is_float( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int lda = bli_obj_col_stride( &a ); + float* alphap = bli_obj_buffer( &alpha ); + float* xp = bli_obj_buffer( &x ); + float* ap = bli_obj_buffer( &a ); + + ssyr_( &uplo, + &mm, + alphap, + xp, &incx, + ap, &lda ); + } + else if ( bli_is_double( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int lda = bli_obj_col_stride( &a ); + double* alphap = bli_obj_buffer( &alpha ); + double* xp = bli_obj_buffer( &x ); + double* ap = bli_obj_buffer( &a ); + + dsyr_( &uplo, + &mm, + alphap, + xp, &incx, + ap, &lda ); + } + else if ( bli_is_scomplex( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int lda = 
bli_obj_col_stride( &a ); + float* alphap = bli_obj_buffer( &alpha ); + scomplex* xp = bli_obj_buffer( &x ); + scomplex* ap = bli_obj_buffer( &a ); + + cher_( &uplo, + &mm, + alphap, + xp, &incx, + ap, &lda ); + } + else if ( bli_is_dcomplex( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int lda = bli_obj_col_stride( &a ); + double* alphap = bli_obj_buffer( &alpha ); + dcomplex* xp = bli_obj_buffer( &x ); + dcomplex* ap = bli_obj_buffer( &a ); + + zher_( &uplo, + &mm, + alphap, + xp, &incx, + ap, &lda ); + } #endif #ifdef PRINT diff --git a/test/test_her2.c b/test/test_her2.c index 7e6a61602..3672051dd 100644 --- a/test/test_her2.c +++ b/test/test_her2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -41,7 +42,7 @@ // uplo m alpha x incx y incy a lda //void dsyr2_( char*, int*, double*, double*, int*, double*, int*, double*, int* ); - + //#define PRINT int main( int argc, char** argv ) @@ -80,11 +81,8 @@ int main( int argc, char** argv ) m_input = 6; #endif -#if 1 - dt_alpha = dt_x = dt_y = dt_a = BLIS_DOUBLE; -#else - dt_alpha = dt_x = dt_y = dt_a = BLIS_DCOMPLEX; -#endif + // her2 supports complex and double complex + dt_alpha = dt_x = dt_y = dt_a = BLIS_SCOMPLEX; uplo = BLIS_LOWER; @@ -128,7 +126,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) @@ -142,37 +140,93 @@ int main( int argc, char** argv ) bli_printm( "x", &x, "%4.1f", "" ); bli_printm( "y", &y, "%4.1f", "" ); bli_printm( "a", &a, "%4.1f", "" ); -#endif +#endif #ifdef BLIS - //bli_obj_toggle_conj( &x ); - //bli_obj_toggle_conj( &y ); - - //bli_syr2( &alpha, bli_her2( &alpha, &x, &y, &a ); #else + if ( 
bli_is_float( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + f77_int lda = bli_obj_col_stride( &a ); + float* alphap = bli_obj_buffer( &alpha ); + float* xp = bli_obj_buffer( &x ); + float* yp = bli_obj_buffer( &y ); + float* ap = bli_obj_buffer( &a ); + + ssyr2_( &uplo, + &mm, + alphap, + xp, &incx, + yp, &incy, + ap, &lda ); + } + else if ( bli_is_double( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + f77_int lda = bli_obj_col_stride( &a ); + double* alphap = bli_obj_buffer( &alpha ); + double* xp = bli_obj_buffer( &x ); + double* yp = bli_obj_buffer( &y ); + double* ap = bli_obj_buffer( &a ); + + dsyr2_( &uplo, + &mm, + alphap, + xp, &incx, + yp, &incy, + ap, &lda ); + } + else if ( bli_is_scomplex( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + f77_int lda = bli_obj_col_stride( &a ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* xp = bli_obj_buffer( &x ); + scomplex* yp = bli_obj_buffer( &y ); + scomplex* ap = bli_obj_buffer( &a ); + + cher2_( &uplo, + &mm, + alphap, + xp, &incx, + yp, &incy, + ap, &lda ); + } + else if ( bli_is_dcomplex( dt_a ) ) + { + f77_char uplo = 'L'; + f77_int mm = bli_obj_length( &a ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + f77_int lda = bli_obj_col_stride( &a ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* xp = bli_obj_buffer( &x ); + dcomplex* yp = bli_obj_buffer( &y ); + dcomplex* ap = bli_obj_buffer( &a ); + + zher2_( &uplo, + &mm, + alphap, + xp, &incx, + yp, &incy, + ap, &lda ); + } - f77_char uplo = 'L'; - f77_int mm = bli_obj_length( &a ); - f77_int incx = bli_obj_vector_inc( &x ); - f77_int incy = bli_obj_vector_inc( 
&y ); - f77_int lda = bli_obj_col_stride( &a ); - double* alphap = bli_obj_buffer( &alpha ); - double* xp = bli_obj_buffer( &x ); - double* yp = bli_obj_buffer( &y ); - double* ap = bli_obj_buffer( &a ); - - dsyr2_( &uplo, - &mm, - alphap, - xp, &incx, - yp, &incy, - ap, &lda ); #endif #ifdef PRINT diff --git a/test/test_her2k.c b/test/test_her2k.c index 85dabc98d..7e8a7b8fe 100644 --- a/test/test_her2k.c +++ b/test/test_her2k.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -84,13 +85,10 @@ int main( int argc, char** argv ) k_input = 1; #endif -#if 1 - //dt = BLIS_FLOAT; - dt = BLIS_DOUBLE; -#else + // her2k supports complex and double complex //dt = BLIS_SCOMPLEX; dt = BLIS_DCOMPLEX; -#endif + uploc = BLIS_LOWER; //uploc = BLIS_UPPER; @@ -153,7 +151,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) @@ -181,16 +179,16 @@ int main( int argc, char** argv ) #else if ( bli_is_float( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - float* alphap = bli_obj_buffer( &alpha ); - float* ap = bli_obj_buffer( &a ); - float* bp = bli_obj_buffer( &b ); - float* betap = bli_obj_buffer( &beta ); - float* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* bp = bli_obj_buffer( &b ); + float* betap = bli_obj_buffer( &beta ); + 
float* cp = bli_obj_buffer( &c ); ssyr2k_( &f77_uploc, &f77_transa, @@ -204,16 +202,16 @@ int main( int argc, char** argv ) } else if ( bli_is_double( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - double* ap = bli_obj_buffer( &a ); - double* bp = bli_obj_buffer( &b ); - double* betap = bli_obj_buffer( &beta ); - double* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* bp = bli_obj_buffer( &b ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); dsyr2k_( &f77_uploc, &f77_transa, @@ -227,16 +225,16 @@ int main( int argc, char** argv ) } else if ( bli_is_scomplex( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - scomplex* alphap = bli_obj_buffer( &alpha ); - scomplex* ap = bli_obj_buffer( &a ); - scomplex* bp = bli_obj_buffer( &b ); - float* betap = bli_obj_buffer( &beta ); - scomplex* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* bp = bli_obj_buffer( &b ); + float* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); cher2k_( &f77_uploc, &f77_transa, @@ -250,16 +248,16 @@ 
int main( int argc, char** argv ) } else if ( bli_is_dcomplex( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - dcomplex* alphap = bli_obj_buffer( &alpha ); - dcomplex* ap = bli_obj_buffer( &a ); - dcomplex* bp = bli_obj_buffer( &b ); - double* betap = bli_obj_buffer( &beta ); - dcomplex* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* bp = bli_obj_buffer( &b ); + double* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); zher2k_( &f77_uploc, &f77_transa, diff --git a/test/test_herk.c b/test/test_herk.c index dc5725612..cbf963a33 100644 --- a/test/test_herk.c +++ b/test/test_herk.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -83,14 +84,10 @@ int main( int argc, char** argv ) m_input = 3; k_input = 1; #endif - -#if 1 - //dt = BLIS_FLOAT; - dt = BLIS_DOUBLE; -#else + + // herk supports complex and double complex //dt = BLIS_SCOMPLEX; dt = BLIS_DCOMPLEX; -#endif uploc = BLIS_LOWER; //uploc = BLIS_UPPER; @@ -145,7 +142,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) @@ -171,14 +168,14 @@ int main( int argc, char** argv ) #else if ( bli_is_float( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - float* alphap = bli_obj_buffer( &alpha ); - float* ap = bli_obj_buffer( &a ); - float* betap = bli_obj_buffer( &beta ); - float* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* betap = bli_obj_buffer( &beta ); + float* cp = bli_obj_buffer( &c ); ssyrk_( &f77_uploc, &f77_transa, @@ -191,14 +188,14 @@ int main( int argc, char** argv ) } else if ( bli_is_double( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - double* ap = bli_obj_buffer( &a ); - double* betap = bli_obj_buffer( &beta ); - double* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + double* ap = 
bli_obj_buffer( &a ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); dsyrk_( &f77_uploc, &f77_transa, @@ -211,14 +208,14 @@ int main( int argc, char** argv ) } else if ( bli_is_scomplex( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - float* alphap = bli_obj_buffer( &alpha ); - scomplex* ap = bli_obj_buffer( &a ); - float* betap = bli_obj_buffer( &beta ); - scomplex* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + float* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); cherk_( &f77_uploc, &f77_transa, @@ -231,14 +228,14 @@ int main( int argc, char** argv ) } else if ( bli_is_dcomplex( dt ) ) { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - dcomplex* ap = bli_obj_buffer( &a ); - double* betap = bli_obj_buffer( &beta ); - dcomplex* cp = bli_obj_buffer( &c ); + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + double* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); zherk_( &f77_uploc, &f77_transa, From a4bc03b990fe0572001eb6409efd12cd70677dcf Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 19 Nov 2021 13:29:00 -0600 Subject: [PATCH 009/230] Brief mention/link to Addons.md in README.md. 
Details: - Add a blurb about the new addons feature to the "Documentation for BLIS developers" section of the README.md, which also links to the Addons.md document. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 2abe79400..372b6857c 100644 --- a/README.md +++ b/README.md @@ -512,6 +512,11 @@ learn how to add new sub-configurations or configuration families, or are simply interested in learning how BLIS organizes its configurations and kernel sets, please read this thorough walkthrough of the configuration system. + * **[Addon Guide](docs/Addons.md).** If you are interested in learning +about using BLIS addons--that is, enabling existing (or creating new) bundles +of operation or API code that are built into a BLIS library--please read this +document. + * **[Sandbox Guide](docs/Sandboxes.md).** If you are interested in learning about using sandboxes in BLIS--that is, providing alternative implementations of the `gemm` operation--please read this document. From 12c66a4acc77bf4927b01e2358e2ac10b61e0a53 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 19 Nov 2021 14:43:53 -0600 Subject: [PATCH 010/230] Minor updates to README.md, docs/Addons.md. Details: - Add additional mentions of addons to README.md, including in the "What's New" section. - Removed mention of sandboxes from the long list of advantages provided by BLIS. - Very minor description update to opening line of Addons.md. --- README.md | 25 +++++++++++++++---------- docs/Addons.md | 4 ++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 372b6857c..211ebd6d5 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,16 @@ all of which are available for free via the [edX platform](http://www.edx.org/). 
What's New ---------- + * **Addons feature now available!** Have you ever wanted to quickly extend BLIS's +operation support or define new custom BLIS APIs for your application, but were +unsure of how to add your source code to BLIS? Do you want to isolate your custom +code so that it only gets enabled when the user requests it? Do you like +[sandboxes](docs/Sandboxes.md), but wish you didn't have to provide an +implementation of `gemm`? If so, you should check out our new +[addons](docs/Addons.md) feature. Addons act like optional extensions that can be +created, enabled, and combined to suit your application's needs, all without +formally integrating your code into the core BLIS framework. + * **Multithreaded small/skinny matrix support for sgemm now available!** Thanks to funding and hardware support from Oracle, we have now accelerated `gemm` for single-precision real matrix problems where one or two dimensions is exceedingly @@ -265,20 +275,13 @@ many will find BLIS's object-based APIs a delight to use when customizing or writing their own BLIS operations. (Objects are relatively lightweight `structs` and passed by address, which helps tame function calling overhead.) - * **Multilayered API, exposed kernels, and sandboxes.** The BLIS framework -exposes its + * **Multilayered API and exposed kernels.** The BLIS framework exposes its implementations in various layers, allowing expert developers to access exactly the functionality desired. This layered interface includes that of the lowest-level kernels, for those who wish to bypass the bulk of the framework. Optimizations can occur at various levels, in part thanks to exposed packing and unpacking facilities, which by default are highly parameterized and -flexible. And more recently, BLIS introduced sandboxes--a way to provide -alternative implementations of `gemm` that do not use any more of the BLIS -infrastructure than is desired. 
Sandboxes provide a convenient and -straightforward way of modifying the `gemm` implementation without disrupting -any other level-3 operation or any other part of the framework. This works -especially well when the developer wants to experiment with new optimizations -or try a different algorithm. +flexible. * **Functionality that grows with the community's needs.** As its name suggests, the BLIS framework is not a single library or static API, but rather @@ -286,7 +289,9 @@ a nearly-complete template for instantiating high-performance BLAS-like libraries. Furthermore, the framework is extensible, allowing developers to leverage existing components to support new operations as they are identified. If such operations require new kernels for optimal efficiency, the framework -and its APIs will be adjusted and extended accordingly. +and its APIs will be adjusted and extended accordingly. Community developers +who wish to experiment with creating new operations or APIs in BLIS can quickly +and easily do so via the [Addons](docs/Addons.md) feature. * **Code re-use.** Auto-generation approaches to achieving the aforementioned goals tend to quickly lead to code bloat due to the multiple dimensions of diff --git a/docs/Addons.md b/docs/Addons.md index 595cebfa4..bd4799fb7 100644 --- a/docs/Addons.md +++ b/docs/Addons.md @@ -10,8 +10,8 @@ ## Introduction -This file briefly describes the requirements for building a custom BLIS -*addon*. +This file briefly describes the requirements for enabling or creating a +custom BLIS *addon*. Simply put, an addon in BLIS provides additional APIs, operations, and/or implementations that may be useful to certain users. An addon can be From e229e049ca08dfbd45794669df08a71dba892925 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 1 Dec 2021 17:36:22 -0600 Subject: [PATCH 011/230] Added recu-sed.sh script to 'build' directory. Details: - Added a recursive sed script to the 'build' directory. 
--- build/recu-sed.sh | 488 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100755 build/recu-sed.sh diff --git a/build/recu-sed.sh b/build/recu-sed.sh new file mode 100755 index 000000000..e7a1d43db --- /dev/null +++ b/build/recu-sed.sh @@ -0,0 +1,488 @@ +#!/bin/bash + +# +# recursive-sed.sh +# +# Field G. Van Zee +# + +print_usage() +{ + # Echo usage info + echo " " + echo " "$script_name + echo " " + echo " Field G. Van Zee" + echo " " + echo " Recusively descend a directory tree and perform sed commands, either on" + echo " the filename or the file contents, or both." + echo " " + echo " Usage:" + echo " ${script_name} [options]" + echo " " + echo " The following options are accepted:" + echo " " + echo " -d " + echo " Dry run. Go through all the motions, but don't actually" + echo " apply any of the sed expressions to file names or contents." + echo " -N " + echo " Do not proceed recursively into subdirectories; consider" + echo " only the files within the current directory. Default" + echo " behavior is to act recursively." + echo " -h " + echo " Consider hidden files and directories. Default behavior is" + echo " to ignore them." + echo " -n " + echo " Use svn mv instead of mv when renaming the file." + echo " Notice that this only applies if the filename changes." + echo " -p pattern " + echo " Specifies the filename pattern, as would be given to the" + echo " ls utility, to limit which files are affected. Default is" + echo " the to consider all files present." + echo " -r dir" + echo " The root directory for the recursive action to be performed." + echo " Default is to use the current working directory." 
+ echo " -v [0|1|2]" + echo " verboseness level" + echo " level 0: silent (no output)" + echo " level 1: default (one line per directory; supress ls stderr)" + echo " level 2: verbose (one line per directory; show ls stderr)" + echo " " + echo " At least one of the following option-argument pairs is required:" + echo " " + echo " -f sed_expr " + echo " Specifies the sed expression that will be applied to the" + echo " filenames of the files touched by the script. This expression" + echo " must be a search-and-replace pattern." + echo " -c sed_expr " + echo " Specifies the sed expression that will be applied to the" + echo " contents of the files touched by the script. This expression" + echo " should be a search-and-replace pattern." + echo " -s sed_script" + echo " Specifies an arbitrary sed script that will be applied to the" + echo " file contents of the files touched by the script." + echo " " + echo " Note: -c and -s options are mutually exclusive." + echo " " + + # Exit with non-zero exit status + exit 1 +} + + + + +perform_sed() +{ + # Variables set by getopts. + local exist_dir="$1" + + #echo "exist_dir: $exist_dir" + + # The suffix used to create temporary files + local temp_file_suffix="sed_temp" + + # Check that exist_dir actually exists and is a directory + if [ ! -d "${exist_dir}" ]; then + echo "${script_name}: ${exist_dir} does not seem to be a valid directory." + exit 1 + fi + + # Check that the filename sed expression, if given, begins with an 's'. + if [ -n "$filename_sed_expr" ]; then + + # If it's a valid search-and-replace expression, this should return an 's'. + filename_sed_char=${filename_sed_expr%%/*} + + if [ "$filename_sed_char" != "s" ]; then + echo "${script_name}: sed expression given with -f must be search-and-replace." + exit 1 + fi + fi + + # Check that the sed script, if given, exists. + if [ -n "$contents_sed_script" ]; then + + if [ ! 
-f ${contents_sed_script} ]; then + echo "${script_name}: ${contents_sed_script} is not a regular file or does not exist." + exit 1 + fi + fi + + # Assume that the sed expression is a search-and-replace. Extract the patterns + # to match on. (Arbitrary sed expressions should be applied through a sed script.) + if [ "$filename_sed_expr" != "" ]; then + filename_sed_match=${filename_sed_expr#s/} + filename_sed_match=${filename_sed_match%%/*} + fi + + + # Get the list of source files in the directory given. Supress stderr if + # level 0 or 1 verbosity was requested. + #if [ "$verbose_level" != "2" ]; then + # old_filepaths=$(ls -d -b ${exist_dir}/${filename_pattern} 2> /dev/null) + #else + # old_filepaths="$(ls -d -b ${exist_dir}/${filename_pattern})" + #fi + + #echo $old_filepaths + #echo "$exist_dir/$filename_pattern" + + #for old_filepath in $old_filepaths; do + #echo "exist_dir: $exist_dir" + + # Find all files that match the pattern in the current directory. + find "${exist_dir}" -maxdepth 1 -name "${filename_pattern}" -print | while read old_filepath + do + #echo "old_filepath: $old_filepath" + + # Skip the current directory. + if [ "${old_filepath}" == "${exist_dir}" ]; then + continue + fi + + # Skip any non-regular files. + if [ ! -f "$old_filepath" ]; then + + # And say we are doing so if verboseness was requested. + if [ "$verbose_level" = "2" ]; then + echo "${script_name}: Ignoring $old_filepath" + fi + continue + fi + + # Strip exist_dir from filename. + old_filename=${old_filepath##*/} + + # Strip the filename from old_filepath to leave the directory path. + old_dirpath=${old_filepath%/*} + + # Create a new filename from the old one. If a filename sed expression was given, + # it will be applied now. + if [ "$filename_sed_expr" != "" ]; then + new_filename=$(echo "${old_filename}" | sed "${filename_sed_expr}") + else + new_filename="${old_filename}" + fi + + #echo "new_filename: $new_filename" + + # Create the filepath to the new file location. 
+ new_filepath="${old_dirpath}/${new_filename}" + #echo "new_filepath: $new_filepath" + + # Grep for the filename pattern within the filename of the current file. + if [ "$filename_sed_expr" != "" ]; then + grep_filename=$(echo "${old_filename}" | grep "${filename_sed_match}") + fi + + + # If we are not performing a dry run, proceed. + if [ -z "$dry_run_flag" ]; then + + # Save the old file permissions so we can re-apply them to the + # new file if its contents change (ie: if it's not just a 'mv', + # which inherently preserves file permissions). + old_perms=$(stat -c %a "${old_filepath}") + + # If the old and new filepaths are different, then we start off by + # renaming the file. (Otherwise, if the old and new filepaths are + # identical, then we don't need to do anything to the file.) If + # the user requested that we use svn mv, then do that, otherwise we + # use regular mv. + if [ "${old_filepath}" != "${new_filepath}" ]; then + + if [ -n "$use_svn_mv_flag" ]; then + + svn mv "${old_filepath}" "${new_filepath}" + else + + mv -f "${old_filepath}" "${new_filepath}" + fi + fi + #else + + # A dry run still needs the act upon the "new" file, so if the + # filepaths are different, simply set the new filepath to the + # old one. (We won't need the previous value of new_filepath + # anymore.) + #if [ "${old_filepath}" != "${new_filepath}" ]; then + # new_filepath="${old_filepath}" + #fi + fi + + # Handle the cases that might change the contents of the file. + if [ "$contents_sed_expr" != "" ] || + [ "$contents_sed_script" != "" ]; then + + # Execute the sed command based on whether the sed action was given + # as a command line expression or a script residing in a file. + if [ "$contents_sed_script" != "" ]; then + + # Perform the action, saving the result to a temporary file. 
+ cat "${new_filepath}" | sed -f ${contents_sed_script} \ + > ${new_filepath}.${temp_file_suffix} + + elif [ "$contents_sed_expr" != "" ]; then + + # Perform the action, saving the result to a temporary file. + cat "${new_filepath}" | sed -e "${contents_sed_expr}" \ + > ${new_filepath}.${temp_file_suffix} + fi + + # Check the difference. + file_diff=$(diff "${new_filepath}" "${new_filepath}.${temp_file_suffix}") + + + # If we are not performing a dry run, proceed. + if [ -z "$dry_run_flag" ]; then + + # If the file contents change. + if [ -n "$file_diff" ]; then + + # Apply the old file permissions to the new file (before we + # potentially overwrite the old file with the new one). + chmod ${old_perms} "${new_filepath}.${temp_file_suffix}" + + # Apply the file contents changes to the new filepath (which may + # or may not be the same as the old filepath). + mv -f "${new_filepath}.${temp_file_suffix}" "${new_filepath}" + + else + # Otherwise remove the new temporary file since it is identical + # to the original. + rm -f "${new_filepath}.${temp_file_suffix}" + fi + else + # Simply remove the file since we are only performing a dry run. + rm -f "${new_filepath}.${temp_file_suffix}" + fi + + fi + + # Check for dos2unix. If it's not here, we'll just substitute cat. + #type_dos2unix=$(type -path dos2unix) + #if [ -n "$type_dos2unix" ]; then + # dos2unix -q ${new_filepath} + #fi + + # Create a string that indicates what we are changing. We'll use this in + # the verbose progress echo to indicate how the file is or would be changed. 
+ if [ -n "$grep_filename" ] && [ -n "$file_diff" ]; then + which_matches="filename/contents" + file_touched="yes" + elif [ -n "$grep_filename" ] && [ -z "$file_diff" ]; then + which_matches="filename " + file_touched="yes" + elif [ -z "$grep_filename" ] && [ -n "$file_diff" ]; then + which_matches=" contents" + file_touched="yes" + else + which_matches="" + file_touched="no" + fi + + # Be verbose, if requested, about which file we're looking at. + if [ "$verbose_level" != "0" ]; then + + # But we only need to output a line if the file was touched. + if [ "$file_touched" != "no" ]; then + + # Construct a relative filepath by stripping the initial root + # directory so that the output does not span as many columns on + # the terminal. + rel_old_filepath=${old_filepath#${initial_root_dir}/} + + # Add a "dry run" condition to the output if we're doing a dry-run + # so that the user knows we didn't really change anything. + if [ -z "$dry_run_flag" ]; then + echo "$script_name: Changing [${which_matches}] of ${rel_old_filepath}" + else + echo "$script_name: Changing (dry run) [${which_matches}] of ${rel_old_filepath}" + fi + fi + fi + + done + + # Exit peacefully. + return 0 +} + + + + +recursive_sed() +{ + # Local variable declarations + local item sub_items curr_dir this_dir + + + # Extract our argument + curr_dir="$1" + + + # Call our function to perform the sed operations on the files in the + # directory given. + perform_sed "${curr_dir}" + + + # If we were asked to act recursively, then continue processing + # curr_dir's contents. + if [ "$recursive_flag" = "1" ]; then + + # Get a listing of items in the directory according to the hidden + # files/directories flag. + if [ -n "$hidden_files_dirs_flag" ]; then + + # Get a listing of the directories in curr_dir (including hidden + # files and directories). + sub_items=$(ls -a "$curr_dir") + + else + + # Get a listing of the directories in curr_dir. 
+ sub_items=$(ls "$curr_dir") + fi + + #echo "sub_items: $sub_items" + + # Descend into the contents of curr_dir, calling recursive_sed on + # any items that are directories. + find "${curr_dir}" -maxdepth 1 -name "*" -print | while read item + do + + #echo "considering item: $item" + + # Skip the current directory. + if [ "${item}" == "${curr_dir}" ]; then + continue + fi + + # If item is a directory, descend into it. + if [ -d "$item" ]; then + + #echo "item is dir: $item" + + recursive_sed "$item" + fi + done + + fi + + + # Return peacefully + return 0 +} + + + + +main() +{ + # Variables set by getopts. + dry_run_flag="" + hidden_files_dirs_flag="" + use_svn_mv_flag="" + filename_pattern="" + root_dir="" + initial_root_dir="" + verbose_level="" + filename_sed_expr="" + contents_sed_expr="" + contents_sed_script="" + + recursive_flag="1" + + + # Get the script name + script_name=${0##*/} + + + # Local variable declarations. + local item sub_items this_dir + + + # Process our command line options. + while getopts ":c:df:hp:r:s:nNv:" opt; do + case $opt in + d ) dry_run_flag="1" ;; + h ) hidden_files_dirs_flag="1" ;; + n ) use_svn_mv_flag="1" ;; + N ) recursive_flag="0" ;; + v ) verbose_level="$OPTARG" ;; + p ) filename_pattern="$OPTARG" ;; + r ) root_dir="$OPTARG" ;; + f ) filename_sed_expr="$OPTARG" ;; + c ) contents_sed_expr="$OPTARG" ;; + s ) contents_sed_script="$OPTARG" ;; + \? ) print_usage + esac + done + shift $(($OPTIND - 1)) + + + # Make sure we've parsed all command line arguments by now. + if [ $# != "0" ]; then + echo "${script_name}: Unparsed command line arguments! Try running with no arguments for help." + exit 1 + fi + + + # Make sure we received at least one of the required options. + if [ -z "$filename_sed_expr" ] && + [ -z "$contents_sed_expr" ] && + [ -z "$contents_sed_script" ]; then + print_usage + fi + + + # Make sure that both a file contents sed expression and sed script were + # not given. 
+ if [ "$contents_sed_expr" != "" ] && + [ "$contents_sed_script" != "" ] ; then + echo "${script_name}: The -c and -s options may not be used at the same time." + exit 1 + fi + + + # Make sure that verboseness level is valid. + if [ "$verbose_level" != "0" ] && + [ "$verbose_level" != "1" ] && + [ "$verbose_level" != "2" ]; then + verbose_level="1" + fi + + # Prepare the filename pattern arguments to perform_sed(). + if [ "$filename_pattern" = "" ] ; then + filename_pattern='*' + fi + + # Prepare the directory arguments to perform_sed(). + if [ "$root_dir" != "" ] ; then + + # Strip / from end of directory paths, if there is one. + root_dir=${root_dir%/} + else + root_dir=$PWD + fi + initial_root_dir=${root_dir} + + + #echo "root_dir: $root_dir" + + + # Begin recursing on the root directory. + recursive_sed "$root_dir" + + + # Exit peacefully + return 0 +} + + + + +# The script's main entry point, passing all parameters given. +main "$@" + From cf7d616a2fd58e293b496770654040818bf5609c Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 2 Dec 2021 17:10:03 -0600 Subject: [PATCH 012/230] Enable user-customized packm ukernel/variant. (#549) Details: - Added four new fields to obj_t: .pack_fn, .pack_params, .ker_fn, and .ker_params. These fields store pointers to functions and data that will allow the user to more flexibly create custom operations while recycling BLIS's existing partitioning infrastructure. - Updated typed API to packm variant and structure-aware kernels to replace the diagonal offset with panel offsets, and changed strides of both C and P to inc/ldim semantics. Updated object API to the packm variant to include rntm_t*. - Removed the packm variant function pointer from the packm cntl_t node definition since it has been replaced by the .pack_fn pointer in the obj_t. - Updated bli_packm_int() to read the new packm variant function pointer from the obj_t and call it instead of from the cntl_t node. 
- Moved some of the logic of bli_l3_packm.c to a new file, bli_packm_alloc.c. - Rewrote bli_packm_blk_var1.c so that it uses byte (char*) pointers instead of typed pointers, allowing a single function to be used regardless of datatype. This obviated having a separate implementation in bli_packm_blk_var1_md.c. Also relegated handling of scalars to a new function, bli_packm_scalar(). - Employed a new standard whereby right-hand matrix operands ("B") are always packed as column-stored row panels -- that is, identically to that of left-hand matrix operands ("A"). This means that while we pack matrix A normally, we actually pack B in a transposed state. This allowed us to simplify a lot of code throughout the framework, and also affected some of the logic in bli_l3_packa() and _packb(). - Simplified bli_packm_init.c in light of the new B^T convention described above. bli_packm_init()--which is now called from within bli_packm_blk_var1()--also now calls bli_packm_alloc() and returns a bool that indicates whether packing should be performed (or skipped). - Consolidated bli_gemm_int() and bli_trsm_int() into a bli_l3_int(), which, among other things, defaults the new .pack_fn field of the obj_t to bli_packm_blk_var1() if the field is NULL. - Defined a new function, bli_obj_reset_origin(), which permanently refocuses the view of an object so that it "forgets" any offsets from its original pointer. This function also sets the object's root field to itself. Calls to bli_obj_reset_origin() for each matrix operand appear in the _front() functions, after the obj_t's are aliased. This resetting of the underlying matrices' origins is needed in preparation for more advanced features from within custom packm kernels. - Redefined bli_pba_rntm_set_pba() from a regular function to a static inline function. - Updated gemm_ukr, gemmtrsm_ukr, and trsm_ukr testsuite modules to use libblis_test_pobj_create() to create local packed objects. 
Previously, these packed objects were created by calling lower-level functions. --- build/libblis-symbols.def | 1 - frame/1m/bli_l1m_ft_ker.h | 18 +- frame/1m/bli_l1m_oft_var.h | 1 + frame/1m/packm/bli_packm.h | 8 +- .../{bli_packm_var.h => bli_packm_alloc.c} | 139 ++- .../packm/bli_packm_alloc.h} | 17 +- frame/1m/packm/bli_packm_blk_var1.c | 824 +++++------------- frame/1m/packm/bli_packm_blk_var1.h | 59 ++ frame/1m/packm/bli_packm_blk_var1_md.c | 344 -------- frame/1m/packm/bli_packm_blk_var1_md.h | 67 -- frame/1m/packm/bli_packm_cntl.c | 4 +- frame/1m/packm/bli_packm_cntl.h | 7 - frame/1m/packm/bli_packm_init.c | 437 ++-------- frame/1m/packm/bli_packm_init.h | 19 +- frame/1m/packm/bli_packm_int.c | 56 +- frame/1m/packm/bli_packm_int.h | 1 + .../packm/bli_packm_scalar.c} | 106 +-- .../{bli_packm_md.h => bli_packm_scalar.h} | 3 +- frame/1m/packm/bli_packm_struc_cxk.c | 327 +++---- frame/1m/packm/bli_packm_struc_cxk.h | 73 +- frame/1m/packm/bli_packm_struc_cxk_1er.c | 335 +++---- frame/1m/packm/bli_packm_struc_cxk_1er.h | 76 +- frame/1m/packm/bli_packm_struc_cxk_md.c | 59 +- frame/1m/packm/bli_packm_struc_cxk_md.h | 21 +- frame/1m/packm/bli_packm_unb_var1.c | 297 ------- frame/1m/packm/bli_packm_unb_var1.h | 66 -- frame/1m/unpackm/bli_unpackm.h | 2 - frame/1m/unpackm/bli_unpackm_unb_var1.c | 131 --- frame/1m/unpackm/bli_unpackm_unb_var1.h | 60 -- frame/3/bli_l3.h | 3 +- frame/3/bli_l3_check.c | 6 +- frame/3/{trsm/bli_trsm_int.c => bli_l3_int.c} | 77 +- frame/3/{gemm/bli_gemm_int.h => bli_l3_int.h} | 2 +- frame/3/bli_l3_oft_var.h | 19 +- .../bli_gemm_packab.c => bli_l3_packab.c} | 45 +- .../{trsm/bli_trsm_int.h => bli_l3_packab.h} | 15 +- frame/3/bli_l3_packm.c | 187 ---- frame/3/gemm/bli_gemm.h | 1 - frame/3/gemm/bli_gemm_blk_var1.c | 2 +- frame/3/gemm/bli_gemm_blk_var2.c | 2 +- frame/3/gemm/bli_gemm_blk_var3.c | 2 +- frame/3/gemm/bli_gemm_cntl.c | 15 +- frame/3/gemm/bli_gemm_front.c | 17 +- frame/3/gemm/bli_gemm_int.c | 127 --- frame/3/gemm/bli_gemm_var.h | 3 - 
frame/3/gemmt/bli_gemmt_front.c | 11 +- frame/3/gemmt/bli_gemmt_x_ker_var2.c | 6 +- frame/3/hemm/bli_hemm_front.c | 17 +- frame/3/symm/bli_symm_front.c | 17 +- frame/3/trmm/bli_trmm_front.c | 17 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 8 +- frame/3/trmm3/bli_trmm3_front.c | 17 +- frame/3/trsm/bli_trsm.h | 2 - frame/3/trsm/bli_trsm_blk_var1.c | 6 +- frame/3/trsm/bli_trsm_blk_var2.c | 4 +- frame/3/trsm/bli_trsm_blk_var3.c | 2 +- frame/3/trsm/bli_trsm_cntl.c | 27 +- frame/3/trsm/bli_trsm_front.c | 17 +- frame/3/trsm/bli_trsm_var.h | 2 - frame/3/trsm/bli_trsm_xx_ker_var2.c | 8 +- frame/base/bli_obj.c | 9 +- frame/base/bli_pba.c | 11 - frame/base/bli_pba.h | 11 +- frame/base/bli_sba.c | 84 +- frame/include/bli_obj_macro_defs.h | 60 +- frame/include/bli_type_defs.h | 240 +++-- testsuite/src/test_gemm_ukr.c | 78 +- testsuite/src/test_gemmtrsm_ukr.c | 128 ++- testsuite/src/test_libblis.c | 41 +- testsuite/src/test_libblis.h | 2 +- testsuite/src/test_trsm_ukr.c | 98 +-- 71 files changed, 1290 insertions(+), 3714 deletions(-) rename frame/1m/packm/{bli_packm_var.h => bli_packm_alloc.c} (50%) rename frame/{3/bli_l3_packm.h => 1m/packm/bli_packm_alloc.h} (88%) create mode 100644 frame/1m/packm/bli_packm_blk_var1.h delete mode 100644 frame/1m/packm/bli_packm_blk_var1_md.c delete mode 100644 frame/1m/packm/bli_packm_blk_var1_md.h rename frame/{3/trsm/bli_trsm_packab.c => 1m/packm/bli_packm_scalar.c} (53%) rename frame/1m/packm/{bli_packm_md.h => bli_packm_scalar.h} (95%) delete mode 100644 frame/1m/packm/bli_packm_unb_var1.c delete mode 100644 frame/1m/packm/bli_packm_unb_var1.h delete mode 100644 frame/1m/unpackm/bli_unpackm_unb_var1.c delete mode 100644 frame/1m/unpackm/bli_unpackm_unb_var1.h rename frame/3/{trsm/bli_trsm_int.c => bli_l3_int.c} (74%) rename frame/3/{gemm/bli_gemm_int.h => bli_l3_int.h} (99%) rename frame/3/{gemm/bli_gemm_packab.c => bli_l3_packab.c} (80%) rename frame/3/{trsm/bli_trsm_int.h => bli_l3_packab.h} (90%) delete mode 100644 frame/3/bli_l3_packm.c 
delete mode 100644 frame/3/gemm/bli_gemm_int.c diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 97146a786..8d29d73b2 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1307,7 +1307,6 @@ bli_pba_init_pools bli_pba_pool_size bli_pba_query bli_pba_release -bli_pba_rntm_set_pba bli_memsys_finalize bli_memsys_init bli_mkherm diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 1146ca7d2..2e813cf4a 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -50,21 +50,23 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTDEF( packm ) diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 15e9dae6f..0b60d4e2f 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -48,6 +48,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ + rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index e8aa36328..88657a712 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -33,15 +33,15 @@ */ +#include "bli_packm_alloc.h" #include "bli_packm_cntl.h" #include "bli_packm_check.h" #include "bli_packm_init.h" #include "bli_packm_int.h" +#include "bli_packm_scalar.h" #include "bli_packm_part.h" -#include "bli_packm_var.h" - 
#include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_1er.h" @@ -50,6 +50,8 @@ // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD -#include "bli_packm_md.h" +#include "bli_packm_struc_cxk_md.h" #endif +#include "bli_packm_blk_var1.h" + diff --git a/frame/1m/packm/bli_packm_var.h b/frame/1m/packm/bli_packm_alloc.c similarity index 50% rename from frame/1m/packm/bli_packm_var.h rename to frame/1m/packm/bli_packm_alloc.c index 723e6fdb4..df6750d7a 100644 --- a/frame/1m/packm/bli_packm_var.h +++ b/frame/1m/packm/bli_packm_alloc.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,78 +33,67 @@ */ -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* c, \ - obj_t* p, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* t \ - ); - -GENPROT( packm_unb_var1 ) -GENPROT( packm_blk_var1 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_unb_var1 ) - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool invdiag, \ - bool revifup, \ - bool reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void_fp packm_ker, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( packm_blk_var1 ) +#include "blis.h" + +void* bli_packm_alloc + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Query the pack buffer type from the control tree node. + packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + + // Query the address of the mem_t entry within the control tree node. + mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); + + mem_t* local_mem_p; + mem_t local_mem_s; + + siz_t cntl_mem_size = 0; + + if ( bli_mem_is_alloc( cntl_mem_p ) ) + cntl_mem_size = bli_mem_size( cntl_mem_p ); + + if ( cntl_mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread releases the existing block associated with + // the mem_t entry in the control tree, and then re-acquires a + // new block, saving the associated mem_t entry to local_mem_s. 
+ if ( bli_mem_is_alloc( cntl_mem_p ) ) + { + bli_pba_release + ( + rntm, + cntl_mem_p + ); + } + bli_pba_acquire_m + ( + rntm, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads. + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); + + // Save the chief thread's local mem_t entry to the mem_t field in + // this thread's control tree node. + *cntl_mem_p = *local_mem_p; + + // Barrier so that the master thread doesn't return from the function + // before we are done reading. + bli_thread_barrier( thread ); + } + + return bli_mem_buffer( cntl_mem_p ); +} diff --git a/frame/3/bli_l3_packm.h b/frame/1m/packm/bli_packm_alloc.h similarity index 88% rename from frame/3/bli_l3_packm.h rename to frame/1m/packm/bli_packm_alloc.h index 696dabf59..b433be350 100644 --- a/frame/3/bli_l3_packm.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -5,7 +5,6 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,13 +32,11 @@ */ -void bli_l3_packm - ( - obj_t* x, - obj_t* x_pack, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); +BLIS_EXPORT_BLIS void* bli_packm_alloc + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 5073f7812..edeeae2b9 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -35,35 +35,6 @@ #include "blis.h" -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T) - ( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - pack_t schema, - bool invdiag, - bool revifup, - bool reviflo, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - void_fp packm_ker, - cntx_t* cntx, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1); - static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { @@ -79,614 +50,265 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = NULL, bli_zpackm_struc_cxk_1er, } }, }; +static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); void bli_packm_blk_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, - thrinfo_t* t + thrinfo_t* thread ) { -#ifdef BLIS_ENABLE_GEMM_MD - // Call a different packm implementation when the storage and target - // datatypes differ. - if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) ) - { - bli_packm_blk_var1_md( c, p, cntx, cntl, t ); + // Extract various fields from the control tree. 
+ pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + bool invdiag = bli_cntl_packm_params_does_invert_diag( cntl ); + bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl ); + bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl ); + + // Every thread initializes p and determines the size of memory + // block needed (which gets embedded into the otherwise "blank" mem_t + // entry in the control tree node). Return early if no packing is required. + if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) ) return; - } -#endif - num_t dt_p = bli_obj_dt( p ); + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_packm_int_check( c, p, cntx ); - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - bool invdiag = bli_obj_has_inverted_diag( p ); - bool revifup = bli_obj_is_pack_rev_if_upper( p ); - bool reviflo = bli_obj_is_pack_rev_if_lower( p ); + num_t dt_c = bli_obj_dt( c ); + dim_t dt_c_size = bli_dt_size( dt_c ); - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); + num_t dt_p = bli_obj_dt( p ); + dim_t dt_p_size = bli_dt_size( dt_p ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + struc_t strucc = bli_obj_struc( c ); + doff_t diagoffc = bli_obj_diag_offset( c ); + diag_t diagc = bli_obj_diag( c ); + uplo_t uploc = bli_obj_uplo( c ); + conj_t conjc = bli_obj_conj_status( c ); - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); + dim_t iter_dim = 
bli_obj_length( p ); + dim_t panel_len_full = bli_obj_width( p ); + dim_t panel_len_max = bli_obj_padded_width( p ); - obj_t kappa; - void* buf_kappa; + char* c_cast = bli_obj_buffer_at_off( c ); + inc_t incc = bli_obj_row_stride( c ); + inc_t ldc = bli_obj_col_stride( c ); + dim_t panel_dim_off = bli_obj_row_off( c ); + dim_t panel_len_off = bli_obj_col_off( c ); - func_t* packm_kers; - void_fp packm_ker; + char* p_cast = bli_obj_buffer( p ); + inc_t ldp = bli_obj_col_stride( p ); + inc_t is_p = bli_obj_imag_stride( p ); + dim_t panel_dim_max = bli_obj_panel_dim( p ); + inc_t ps_p = bli_obj_panel_stride( p ); - FUNCPTR_T f; + doff_t diagoffc_inc = ( doff_t )panel_dim_max; + obj_t kappa_local; + char* kappa_cast = bli_packm_scalar( &kappa_local, p ); - // Treatment of kappa (ie: packing during scaling) depends on - // whether we are executing an induced method. - if ( bli_is_nat_packed( schema ) ) - { - // This branch is for native execution, where we assume that - // the micro-kernel will always apply the alpha scalar of the - // higher-level operation. Thus, we use BLIS_ONE for kappa so - // that the underlying packm implementation does not perform - // any scaling during packing. - buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); - } - else // if ( bli_is_ind_packed( schema ) ) - { - obj_t* kappa_p; - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - //printf( "applying non-zero imag kappa\n" ); + // we use the default lookup table to determine the right func_t + // for the current schema. 
+ func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ]; - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); + // Query the datatype-specific function pointer from the func_t object. + packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers ); - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); + // For mixed-precision gemm, select the proper kernel (only dense panels). + if ( dt_c != dt_p ) + { + packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; + } - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + // Query the address of the packm params field of the obj_t. The user might + // have set this field in order to specify a custom packm kernel. + packm_blk_var1_params_t* params = bli_obj_pack_params( c ); - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p ); + if ( params && params->ukr_fn[ dt_c ][ dt_p ] ) + { + // Query the user-provided packing kernel from the obj_t. If provided, + // this overrides the kernel determined above. + packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; } + /* Compute the total number of iterations we'll need. */ + dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); - // The original idea here was to read the packm_ukr from the context - // if it is non-NULL. The problem is, it requires that we be able to - // assume that the packm_ukr field is initialized to NULL, which it - // currently is not. - - //func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. 
*/ + dim_t ic0, ip0; + doff_t ic_inc, ip_inc; - //if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) ) + if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || + ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) { - // If the packm structure-aware kernel func_t in the context is - // NULL (which is the default value after the context is created), - // we use the default lookup table to determine the right func_t - // for the current schema. - const dim_t i = bli_pack_schema_index( schema ); - - packm_kers = &packm_struc_cxk_kers[ i ]; + ic0 = (n_iter - 1) * panel_dim_max; + ic_inc = -panel_dim_max; + ip0 = n_iter - 1; + ip_inc = -1; } -#if 0 - else // cntx's packm func_t overrides + else { - // If the packm structure-aware kernel func_t in the context is - // non-NULL (ie: assumed to be valid), we use that instead. - //packm_kers = bli_cntx_packm_ukrs( cntx ); - packm_kers = cntx_packm_kers; + ic0 = 0; + ic_inc = panel_dim_max; + ip0 = 0; + ip_inc = 1; } -#endif - // Query the datatype-specific function pointer from the func_t object. - packm_ker = bli_func_get_dt( dt_p, packm_kers ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_p]; - - // Invoke the function. - f( strucc, - diagoffc, - diagc, - uploc, - transc, - schema, - invdiag, - revifup, - reviflo, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - is_p, - pd_p, ps_p, - packm_ker, - cntx, - t ); -} + // Query the number of threads and thread ids from the current thread's + // packm thrinfo_t node. + const dim_t nt = bli_thread_n_way( thread ); + const dim_t tid = bli_thread_work_id( thread ); + // Determine the thread range and increment using the current thread's + // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + // will depend on whether slab or round-robin partitioning was requested + // at configure-time. 
+ dim_t it_start, it_end, it_inc; + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); -#undef GENTFUNCR -#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - pack_t schema, \ - bool invdiag, \ - bool revifup, \ - bool reviflo, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - void_fp packm_ker, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ) \ -{ \ - PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \ -\ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict c_begin; \ - ctype* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it, ic, ip; \ - dim_t ic0, ip0; \ - doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - dim_t panel_off_i; \ - inc_t vs_c; \ - inc_t ldc; \ - inc_t ldp, p_inc; \ - dim_t* m_panel_full; \ - dim_t* n_panel_full; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool row_stored; \ - bool col_stored; \ - inc_t is_p_use; \ -\ - ctype* restrict c_use; \ - ctype* restrict p_use; \ - doff_t diagoffp_i; \ -\ -\ - /* If C is zeros and part of a triangular matrix, then we don't need - to pack it. */ \ - if ( bli_is_zeros( uploc ) && \ - bli_is_triangular( strucc ) ) return; \ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ -\ - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. 
*/ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( &rs_c, &cs_c ); \ - bli_negate_diag_offset( &diagoffc ); \ - bli_toggle_uplo( &uploc ); \ - bli_toggle_trans( &transc ); \ - } \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. */ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - ldc = rs_c; \ - vs_c = cs_c; \ - diagoffc_inc = -( doff_t )panel_dim_max; \ - ldp = rs_p; \ - m_panel_full = &m; \ - n_panel_full = &panel_dim_i; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - ldc = cs_c; \ - vs_c = rs_c; \ - diagoffc_inc = ( doff_t )panel_dim_max; \ - ldp = cs_p; \ - m_panel_full = &panel_dim_i; \ - n_panel_full = &n; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ - /* Compute the total number of iterations we'll need. */ \ - n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ -\ - /* Set the initial values and increments for indices related to C and P - based on whether reverse iteration was requested. 
*/ \ - if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ - ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ - { \ - ic0 = (n_iter - 1) * panel_dim_max; \ - ic_inc = -panel_dim_max; \ - ip0 = n_iter - 1; \ - ip_inc = -1; \ - } \ - else \ - { \ - ic0 = 0; \ - ic_inc = panel_dim_max; \ - ip0 = 0; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ - /* Query the number of threads and thread ids from the current thread's - packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ -\ - dim_t it_start, it_end, it_inc; \ -\ - /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() - will depend on whether slab or round-robin partitioning was requested - at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ -\ - /* Iterate over every logical micropanel in the source matrix. */ \ - for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ - c_begin = c_cast + (ic )*vs_c; \ -\ - if ( bli_is_triangular( strucc ) && \ - bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is completely unstored (ie: zero). If the panel - is unstored, we do nothing. (Notice that we don't even - increment p_begin.) */ \ -\ - continue; \ - } \ - else if ( bli_is_triangular( strucc ) && \ - bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \ - { \ - /* This case executes if the panel belongs to a triangular - matrix AND is diagonal-intersecting. 
Notice that we - cannot bury the following conditional logic into - packm_struc_cxk() because we need to know the value of - panel_len_max_i so we can properly increment p_inc. */ \ -\ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( ( col_stored && diagoffc_i < 0 ) || \ - ( row_stored && diagoffc_i > 0 ) ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ - { \ - panel_off_i = 0; \ - panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ - panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \ - panel_len_max ); \ - diagoffp_i = diagoffc_i; \ - } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ - { \ - panel_off_i = bli_abs( diagoffc_i ); \ - panel_len_i = panel_len_full - panel_off_i; \ - panel_len_max_i = panel_len_max - panel_off_i; \ - diagoffp_i = 0; \ - } \ -\ - c_use = c_begin + (panel_off_i )*ldc; \ - p_use = p_begin; \ -\ - /* We need to re-compute the imaginary stride as a function of - panel_len_max_i since triangular packed matrices have panels - of varying lengths. NOTE: This imaginary stride value is - only referenced by the packm kernels for induced methods. */ \ - is_p_use = ldp * panel_len_max_i; \ -\ - /* We nudge the imaginary stride up by one if it is odd. */ \ - is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \ -\ - /* NOTE: We MUST use round-robin partitioning when packing - micropanels of a triangular matrix. Hermitian/symmetric - and general packing may use slab or round-robin, depending - on which was selected at configure-time. 
*/ \ - if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffp_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - /* NOTE: This value is usually LESS than ps_p because triangular - matrices usually have several micro-panels that are shorter - than a "full" micro-panel. */ \ - p_inc = is_p_use; \ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ - /* This case executes if the panel belongs to a Hermitian or - symmetric matrix, which includes stored, unstored, and - diagonal-intersecting panels. */ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ -\ - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( strucc, \ - diagoffc_i, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - p_inc = ps_p; \ - } \ - else \ - { \ - /* This case executes if the panel is general, or, if the - panel is part of a triangular matrix and is neither unstored - (ie: zero) nor diagonal-intersecting. */ \ -\ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - is_p_use = is_p; \ -\ - /* The definition of bli_packm_my_iter() will depend on whether slab - or round-robin partitioning was requested at configure-time. 
*/ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - packm_ker_cast( BLIS_GENERAL, \ - 0, \ - diagc, \ - BLIS_DENSE, \ - conjc, \ - schema, \ - invdiag, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p_use, \ - cntx ); \ - } \ -\ - /* NOTE: This value is equivalent to ps_p. */ \ - p_inc = ps_p; \ - } \ -\ - p_begin += p_inc; \ -\ - } \ -} + char* p_begin = p_cast; -INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 ) + // Iterate over every logical micropanel in the source matrix. + for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; + ic += ic_inc, ip += ip_inc, it += 1 ) + { + dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); + dim_t panel_dim_off_i = panel_dim_off + ic; + + doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc; + char* c_begin = c_cast + (ic )*incc*dt_c_size; + + inc_t p_inc = ps_p; + + // NOTE: We MUST use round-robin partitioning when packing + // micropanels of a triangular matrix. Hermitian/symmetric + // and general packing may use slab or round-robin, depending + // on which was selected at configure-time. + // The definition of bli_packm_my_iter() will depend on whether slab + // or round-robin partitioning was requested at configure-time. + bool my_iter = bli_is_triangular( strucc ) + ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) + : bli_packm_my_iter ( it, it_start, it_end, tid, nt ); + + if ( bli_is_triangular( strucc ) && + bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) + { + // This case executes if the panel belongs to a triangular + // matrix AND is completely unstored (ie: zero). If the panel + // is unstored, we do nothing. (Notice that we don't even + // increment p_begin.) 
+ continue; + } + else if ( bli_is_triangular( strucc ) && + bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) ) + { + // This case executes if the panel belongs to a triangular + // matrix AND is diagonal-intersecting. Notice that we + // cannot bury the following conditional logic into + // packm_struc_cxk() because we need to know the value of + // panel_len_max_i so we can properly increment p_inc. + + // Sanity check. Diagonals should not intersect the short end of + // a micro-panel. If they do, then somehow the constraints on + // cache blocksizes being a whole multiple of the register + // blocksizes was somehow violated. + if ( diagoffc_i < 0 ) + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + + dim_t panel_off_i; + dim_t panel_len_i; + dim_t panel_len_max_i; + + if ( bli_is_lower( uploc ) ) + { + panel_off_i = 0; + panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; + panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, + panel_len_max ); + } + else // if ( bli_is_upper( uploc ) ) + { + panel_off_i = bli_abs( diagoffc_i ); + panel_len_i = panel_len_full - panel_off_i; + panel_len_max_i = panel_len_max - panel_off_i; + } + + dim_t panel_len_off_i = panel_off_i + panel_len_off; + + char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size; + char* p_use = p_begin; + + // We need to re-compute the imaginary stride as a function of + // panel_len_max_i since triangular packed matrices have panels + // of varying lengths. NOTE: This imaginary stride value is + // only referenced by the packm kernels for induced methods. + inc_t is_p_use = ldp * panel_len_max_i; + + // We nudge the imaginary stride up by one if it is odd. + is_p_use += ( bli_is_odd( is_p_use ) ? 
1 : 0 ); + + if ( my_iter ) + { + packm_ker_cast( strucc, + diagc, + uploc, + conjc, + schema, + invdiag, + panel_dim_i, + panel_len_i, + panel_dim_max, + panel_len_max_i, + panel_dim_off_i, + panel_len_off_i, + kappa_cast, + c_use, incc, ldc, + p_use, ldp, + is_p_use, + cntx, + params ); + } + + // NOTE: This value is usually LESS than ps_p because triangular + // matrices usually have several micro-panels that are shorter + // than a "full" micro-panel. + p_inc = is_p_use; + } + else + { + // This case executes if the panel is either dense, or belongs + // to a Hermitian or symmetric matrix, which includes stored, + // unstored, and diagonal-intersecting panels. + + if ( my_iter ) + { + packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc, + diagc, + uploc, + conjc, + schema, + invdiag, + panel_dim_i, + panel_len_full, + panel_dim_max, + panel_len_max, + panel_dim_off_i, + panel_len_off, + kappa_cast, + c_begin, incc, ldc, + p_begin, ldp, is_p, + cntx, + params ); + } + } + p_begin += p_inc*dt_p_size; + } +} -/* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -if ( col_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ - c_cast, rs_c, cs_c, "%4.1f", "" ); \ -*/ -/* -if ( row_stored ) \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -else \ -PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -*/ \ -\ -/* -if ( col_stored ) { \ - if ( bli_thread_work_id( thread ) == 0 ) \ - { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ 
- ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ - if ( bli_thread_work_id( thread ) == 1 ) \ - { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ -} \ -else { \ - if ( bli_thread_work_id( thread ) == 0 ) \ - { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ - if ( bli_thread_work_id( thread ) == 1 ) \ - { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ - fflush( stdout ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ - ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ - PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ - ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - fflush( stdout ); \ - } \ -bli_thread_barrier( thread ); \ -} \ -*/ -/* - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ -*/ -/* - if ( row_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, 
"packm_var2: b_i", *m_panel_max, *n_panel_max, \ - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - inc_t is_b = rs_p * *m_panel_max; \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ -/* - if ( col_stored ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \ - (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ - PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \ - ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h new file mode 100644 index 000000000..9cda5828b --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// +// packm params types. +// + +typedef struct +{ + // Type of C Type of P + packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; +} packm_blk_var1_params_t; + +// +// Prototype object-based interfaces. +// + +BLIS_EXPORT_BLIS void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* t + ); + diff --git a/frame/1m/packm/bli_packm_blk_var1_md.c b/frame/1m/packm/bli_packm_blk_var1_md.c deleted file mode 100644 index a7c694e4f..000000000 --- a/frame/1m/packm/bli_packm_blk_var1_md.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_GEMM_MD - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - trans_t transc, - pack_t schema, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - inc_t is_p, - dim_t pd_p, inc_t ps_p, - cntx_t* cntx, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md); - - -void bli_packm_blk_var1_md - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t - ) -{ - num_t dt_c = bli_obj_dt( c ); - num_t dt_p = bli_obj_dt( p ); - - trans_t transc = bli_obj_conjtrans_status( c ); - pack_t schema = bli_obj_pack_schema( p ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - inc_t is_p = bli_obj_imag_stride( p ); - dim_t pd_p = bli_obj_panel_dim( p ); - inc_t ps_p = bli_obj_panel_stride( p ); - - obj_t kappa; - void* buf_kappa; - - FUNCPTR_T f; - - - // Treatment of kappa (ie: packing during scaling) depends on - // whether we are executing an induced method. - if ( bli_is_nat_packed( schema ) ) - { - // This branch is for native execution, where we assume that - // the micro-kernel will always apply the alpha scalar of the - // higher-level operation. Thus, we use BLIS_ONE for kappa so - // that the underlying packm implementation does not perform - // any scaling during packing. - buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE ); - } - else // if ( bli_is_ind_packed( schema ) ) - { - obj_t* kappa_p; - - // The value for kappa we use will depend on whether the scalar - // attached to A has a nonzero imaginary component. 
If it does, - // then we will apply the scalar during packing to facilitate - // implementing induced complex domain algorithms in terms of - // real domain micro-kernels. (In the aforementioned situation, - // applying a real scalar is easy, but applying a complex one is - // harder, so we avoid the need altogether with the code below.) - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } - - // Acquire the buffer to the kappa chosen above. - buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p ); - } - - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_c][dt_p]; - - // Invoke the function.
- f( - transc, - schema, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - is_p, - pd_p, ps_p, - cntx, - t ); -} - - -#undef GENTFUNC2 -#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ) \ -{ \ - ctype_p* restrict kappa_cast = kappa; \ - ctype_c* restrict c_cast = c; \ - ctype_p* restrict p_cast = p; \ - ctype_c* restrict c_begin; \ - ctype_p* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it, ic, ip; \ - doff_t ic_inc, ip_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - inc_t vs_c; \ - inc_t p_inc; \ - dim_t* m_panel_use; \ - dim_t* n_panel_use; \ - dim_t* m_panel_max; \ - dim_t* n_panel_max; \ - conj_t conjc; \ - bool row_stored; \ - bool col_stored; \ -\ - ctype_c* restrict c_use; \ - ctype_p* restrict p_use; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ -\ - /* If c needs a transposition, induce it so that we can more simply - express the remaining parameters and code. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_swap_incs( &rs_c, &cs_c ); \ - bli_toggle_trans( &transc ); \ - } \ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ -\ - ( void )col_stored; \ -\ - /* If the row storage flag indicates row storage, then we are packing - to column panels; otherwise, if the strides indicate column storage, - we are packing to row panels. */ \ - if ( row_stored ) \ - { \ - /* Prepare to pack to row-stored column panels. */ \ - iter_dim = n; \ - panel_len_full = m; \ - panel_len_max = m_max; \ - panel_dim_max = pd_p; \ - vs_c = cs_c; \ - m_panel_use = &panel_len_i; \ - n_panel_use = &panel_dim_i; \ - m_panel_max = &panel_len_max_i; \ - n_panel_max = &panel_dim_max; \ - } \ - else /* if ( col_stored ) */ \ - { \ - /* Prepare to pack to column-stored row panels. */ \ - iter_dim = m; \ - panel_len_full = n; \ - panel_len_max = n_max; \ - panel_dim_max = pd_p; \ - vs_c = rs_c; \ - m_panel_use = &panel_dim_i; \ - n_panel_use = &panel_len_i; \ - m_panel_max = &panel_dim_max; \ - n_panel_max = &panel_len_max_i; \ - } \ -\ - /* Compute the total number of iterations we'll need. */ \ - n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ -\ - { \ - ic_inc = panel_dim_max; \ - ip_inc = 1; \ - } \ -\ - p_begin = p_cast; \ -\ - /* Query the number of threads and thread ids from the current thread's - packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ -\ - /* Suppress unused variable warnings when slab partitioning is enabled, - since the slab-based definition of bli_packm_my_iter() does not - actually use tid or nt. */ \ - ( void )nt; ( void )tid; \ -\ - dim_t it_start, it_end, it_inc; \ -\ - /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() - will depend on whether slab or round-robin partitioning was requested - at configure-time. 
*/ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ -\ - for ( ic = 0, ip = 0, it = 0; it < n_iter; \ - ic += ic_inc, ip += ip_inc, it += 1 ) \ - { \ - panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ -\ - c_begin = c_cast + (ic )*vs_c; \ -\ - { \ - c_use = c_begin; \ - p_use = p_begin; \ -\ - panel_len_i = panel_len_full; \ - panel_len_max_i = panel_len_max; \ -\ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ - { \ - PASTEMAC2(chc,chp,packm_struc_cxk_md) \ - ( \ - conjc, \ - schema, \ - *m_panel_use, \ - *n_panel_use, \ - *m_panel_max, \ - *n_panel_max, \ - kappa_cast, \ - c_use, rs_c, cs_c, \ - p_use, rs_p, cs_p, \ - is_p, \ - cntx \ - ); \ - } \ -\ - p_inc = ps_p; \ - } \ -\ -/* -if ( row_stored ) \ -PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -else \ -PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \ - p_use, rs_p, cs_p, "%5.2f", "" ); \ -*/ \ -\ - p_begin += p_inc; \ -\ - } \ -} - -INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md ) -INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md ) - -#endif diff --git a/frame/1m/packm/bli_packm_blk_var1_md.h b/frame/1m/packm/bli_packm_blk_var1_md.h deleted file mode 100644 index e6bf151d0..000000000 --- a/frame/1m/packm/bli_packm_blk_var1_md.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_packm_blk_var1_md - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* t - ); - - -#undef GENTPROT2 -#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ -\ -void PASTEMAC2(chc,chp,varname) \ - ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT2_BASIC0( packm_blk_var1_md ) -INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md ) - diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index fc6ba8052..e99ed9cf3 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -35,11 +35,10 @@ #include "blis.h" -cntl_t* bli_packm_cntl_create_node +BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, - void_fp packm_var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, @@ -62,7 +61,6 @@ cntl_t* bli_packm_cntl_create_node // Initialize the packm_params_t struct. params->size = sizeof( packm_params_t ); - params->var_func = packm_var_func; params->bmid_m = bmid_m; params->bmid_n = bmid_n; params->does_invert_diag = does_invert_diag; diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 17aa196e8..14bfe1ce8 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -36,7 +36,6 @@ struct packm_params_s { uint64_t size; // size field must be present and come first. 
- packm_var_oft var_func; bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; @@ -47,11 +46,6 @@ struct packm_params_s }; typedef struct packm_params_s packm_params_t; -BLIS_INLINE packm_var_oft bli_cntl_packm_params_var_func( cntl_t* cntl ) -{ - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->var_func; -} - BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; @@ -93,7 +87,6 @@ cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, - void_fp packm_var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 739fd5f1d..5a7d716fe 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -35,12 +35,14 @@ #include "blis.h" -siz_t bli_packm_init +bool bli_packm_init ( - obj_t* a, + obj_t* c, obj_t* p, cntx_t* cntx, - cntl_t* cntl + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -51,139 +53,27 @@ siz_t bli_packm_init // suitable block of memory from the memory allocator (if such a block // of memory has not already been allocated previously). - bszid_t bmult_id_m; - bszid_t bmult_id_n; - bool does_invert_diag; - bool rev_iter_if_upper; - bool rev_iter_if_lower; - pack_t schema; - //packbuf_t pack_buf_type; - siz_t size_needed; - // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_packm_init_check( a, p, cntx ); - - // Extract various fields from the control tree. 
- bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); - bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); - does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); - rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); - rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - schema = bli_cntl_packm_params_pack_schema( cntl ); - //pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); - -#if 0 - // Let us now check to see if the object has already been packed. First - // we check if it has been packed to an unspecified (row or column) - // format, in which case we can alias the object and return. - // NOTE: The reason we don't need to even look at the control tree in - // this case is as follows: an object's pack status is only set to - // BLIS_PACKED_UNSPEC for situations when the actual format used is - // not important, as long as its packed into contiguous rows or - // contiguous columns. A good example of this is packing for matrix - // operands in the level-2 operations. - if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC ) - { - bli_obj_alias_to( a, p ); - return 0; - } + bli_packm_init_check( c, p, cntx ); - // Now we check if the object has already been packed to the desired - // schema (as encoded in the control tree). If so, we can alias and - // return 0. - // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED - // and thus packing will be called for (but in some cases packing has - // already taken place, or does not need to take place, and so that will - // be indicated by the pack status). Also, not all combinations of - // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( a ) == pack_schema ) - { - bli_obj_alias_to( a, p ); - return 0; - } -#endif + // We begin by copying the fields of A. + bli_obj_alias_to( c, p ); // If the object is marked as being filled with zeros, then we can skip // the packm operation entirely and alias. 
- if ( bli_obj_is_zeros( a ) ) - { - bli_obj_alias_to( a, p ); - return 0; - } - - // Prepare a few other variables based on properties of the control - // tree. - - invdiag_t invert_diag; - packord_t pack_ord_if_up; - packord_t pack_ord_if_lo; - - if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG; - else invert_diag = BLIS_NO_INVERT_DIAG; - - if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; - else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; - - if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; - else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; - - // Initialize object p for the final packed matrix. - size_needed - = - bli_packm_init_pack - ( - invert_diag, - schema, - pack_ord_if_up, - pack_ord_if_lo, - bmult_id_m, - bmult_id_n, - a, - p, - cntx - ); - - // Return the size needed for memory allocation of the packed buffer. - return size_needed; -} + if ( bli_obj_is_zeros( c ) ) + return false; - -siz_t bli_packm_init_pack - ( - invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* a, - obj_t* p, - cntx_t* cntx - ) -{ - bli_init_once(); - - num_t dt_tar = bli_obj_target_dt( a ); - num_t dt_scalar = bli_obj_scalar_dt( a ); - trans_t transa = bli_obj_onlytrans_status( a ); - dim_t m_a = bli_obj_length( a ); - dim_t n_a = bli_obj_width( a ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx ); - - dim_t m_p, n_p; - dim_t m_p_pad, n_p_pad; - siz_t size_p; - siz_t elem_size_p; - inc_t rs_p, cs_p; - inc_t is_p; - - - // We begin by copying the fields of A. - bli_obj_alias_to( a, p ); + // Extract various fields from the control tree. 
+ bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + num_t dt_tar = bli_obj_target_dt( c ); + num_t dt_scalar = bli_obj_scalar_dt( c ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); // Typecast the internal scalar value to the target datatype. // Note that if the typecasting is needed, this must happen BEFORE we @@ -195,51 +85,21 @@ siz_t bli_packm_init_pack // Update the storage datatype of P to be the target datatype of A. bli_obj_set_dt( dt_tar, p ); + bli_obj_set_elem_size( bli_dt_size( dt_tar ), p ); - // Update the dimension fields to explicitly reflect a transposition, - // if needed. - // Then, clear the conjugation and transposition fields from the object - // since matrix packing in BLIS is deemed to take care of all conjugation - // and transposition necessary. - // Then, we adjust the properties of P when A needs a transposition. - // We negate the diagonal offset, and if A is upper- or lower-stored, - // we either toggle the uplo of P. - // Finally, if we mark P as dense since we assume that all matrices, - // regardless of structure, will be densified. - bli_obj_set_dims_with_trans( transa, m_a, n_a, p ); - bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, p ); - if ( bli_does_trans( transa ) ) - { - bli_obj_negate_diag_offset( p ); - if ( bli_obj_is_upper_or_lower( a ) ) - bli_obj_toggle_uplo( p ); - } + // Store the pack schema to the object. + bli_obj_set_pack_schema( schema, p ); - // If we are packing micropanels, mark P as dense. 
Otherwise, we are - // probably being called in the context of a level-2 operation, in - // which case we do not want to overwrite the uplo field of P (inherited - // from A) with BLIS_DENSE because that information may be needed by - // the level-2 operation's unblocked variant to decide whether to - // execute a "lower" or "upper" branch of code. - if ( bli_is_panel_packed( schema ) ) - { - bli_obj_set_uplo( BLIS_DENSE, p ); - } + // Clear the conjugation field from the object since matrix packing + // in BLIS is deemed to take care of all conjugation necessary. + bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); + + // Since we are packing micropanels, mark P as dense. + bli_obj_set_uplo( BLIS_DENSE, p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, p ); - // Set the invert diagonal field. - bli_obj_set_invert_diag( invert_diag, p ); - - // Set the pack status of P to the pack schema prescribed in the control - // tree node. - bli_obj_set_pack_schema( schema, p ); - - // Set the packing order bits. - bli_obj_set_pack_order_if_upper( pack_ord_if_up, p ); - bli_obj_set_pack_order_if_lower( pack_ord_if_lo, p ); - // Compute the dimensions padded by the dimension multiples. These // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. @@ -247,10 +107,10 @@ siz_t bli_packm_init_pack // in P) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. - m_p = bli_obj_length( p ); - n_p = bli_obj_width( p ); - m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); - n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); + dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); // Save the padded dimensions into the packed object. 
It is important // to save these dimensions since they represent the actual dimensions @@ -258,177 +118,70 @@ siz_t bli_packm_init_pack bli_obj_set_padded_dims( m_p_pad, n_p_pad, p ); // Now we prepare to compute strides, align them, and compute the - // total number of bytes needed for the packed buffer. The caller - // will then use that value to acquire an appropriate block of memory - // from the memory allocator. + // total number of bytes needed for the packed buffer. Then we use + // that value to acquire an appropriate block of memory from the + // memory allocator. // Extract the element size for the packed object. - elem_size_p = bli_obj_elem_size( p ); - - // Set the row and column strides of p based on the pack schema. - if ( bli_is_row_packed( schema ) && - !bli_is_panel_packed( schema ) ) - { - // For regular row storage, the padded width of our matrix - // should be used for the row stride, with the column stride set - // to one. By using the WIDTH of the mem_t region, we allow for - // zero-padding (if necessary/desired) along the right edge of - // the matrix. - rs_p = n_p_pad; - cs_p = 1; - - // Align the leading dimension according to the heap stride - // alignment size so that the second, third, etc rows begin at - // aligned addresses. - rs_p = bli_align_dim_to_size( rs_p, elem_size_p, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - - // Store the strides in P. - bli_obj_set_strides( rs_p, cs_p, p ); - - // Compute the size of the packed buffer. - size_p = m_p_pad * rs_p * elem_size_p; - } - else if ( bli_is_col_packed( schema ) && - !bli_is_panel_packed( schema ) ) - { - // For regular column storage, the padded length of our matrix - // should be used for the column stride, with the row stride set - // to one. By using the LENGTH of the mem_t region, we allow for - // zero-padding (if necessary/desired) along the bottom edge of - // the matrix. 
- cs_p = m_p_pad; - rs_p = 1; - - // Align the leading dimension according to the heap stride - // alignment size so that the second, third, etc columns begin at - // aligned addresses. - cs_p = bli_align_dim_to_size( cs_p, elem_size_p, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - - // Store the strides in P. - bli_obj_set_strides( rs_p, cs_p, p ); - - // Compute the size of the packed buffer. - size_p = cs_p * n_p_pad * elem_size_p; - } - else if ( bli_is_row_packed( schema ) && - bli_is_panel_packed( schema ) ) - { - dim_t m_panel; - dim_t ps_p; - - // The panel dimension (for each datatype) should be equal to the - // default (logical) blocksize multiple in the m dimension. - m_panel = bmult_m_def; - - // The "column stride" of a row-micropanel packed object is interpreted - // as the column stride WITHIN a micropanel. Thus, this is equal to the - // packing (storage) blocksize multiple, which may be equal to the - // default (logical) blocksize multiple). - cs_p = bmult_m_pack; - - // The "row stride" of a row-micropanel packed object is interpreted - // as the row stride WITHIN a micropanel. Thus, it is unit. - rs_p = 1; - - // The "panel stride" of a micropanel packed object is interpreted as - // the distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the padded width computed above to - // allow for zero-padding (if necessary/desired) along the far end - // of each micropanel (ie: the right edge of the matrix). Zero-padding - // can also occur along the long edge of the last micropanel if the m - // dimension of the matrix is not a whole multiple of MR. - ps_p = cs_p * n_p_pad; - - // As a general rule, we don't want micropanel strides to be odd. - // NOTE: This safety feature *may* not be necessary anymore, but was - // definitely needed to support certain variations of the 3m method. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - - // Set the imaginary stride (in units of fundamental elements). 
- // This is the number of real elements that must be traversed before - // reaching the imaginary part of the packed micropanel. NOTE: the - // imaginary stride is mostly vestigial and left over from the 3m - // and 4m implementations. - is_p = 1; - - // Store the strides and panel dimension in P. - bli_obj_set_strides( rs_p, cs_p, p ); - bli_obj_set_imag_stride( is_p, p ); - bli_obj_set_panel_dim( m_panel, p ); - bli_obj_set_panel_stride( ps_p, p ); - bli_obj_set_panel_length( m_panel, p ); - bli_obj_set_panel_width( n_p, p ); - - // Compute the size of the packed buffer. - size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; - } - else if ( bli_is_col_packed( schema ) && - bli_is_panel_packed( schema ) ) - { - dim_t n_panel; - dim_t ps_p; - - // The panel dimension (for each datatype) should be equal to the - // default (logical) blocksize multiple in the n dimension. - n_panel = bmult_n_def; - - // The "row stride" of a column-micropanel packed object is interpreted - // as the row stride WITHIN a micropanel. Thus, this is equal to the - // packing (storage) blocksize multiple (which may be equal to the - // default (logical) blocksize multiple. - rs_p = bmult_n_pack; - - // The "column stride" of a column-micropanel packed object is - // interpreted as the column stride WITHIN a micropanel. Thus, it is - // unit. - cs_p = 1; - - // The "panel stride" of a micropanel packed object is interpreted as - // the distance between the (0,0) element of panel k and the (0,0) - // element of panel k+1. We use the padded length computed above to - // allow for zero-padding (if necessary/desired) along the far end - // of each micropanel (ie: the bottom edge of the matrix). Zero-padding - // can also occur along the long edge of the last micropanel if the n - // dimension of the matrix is not a whole multiple of NR. - ps_p = m_p_pad * rs_p; - - // As a general rule, we don't want micropanel strides to be odd. 
- // NOTE: This safety feature *may* not be necessary anymore, but was - // definitely needed to support certain variations of the 3m method. - if ( bli_is_odd( ps_p ) ) ps_p += 1; - - // Set the imaginary stride (in units of fundamental elements). - // This is the number of real elements that must be traversed before - // reaching the imaginary part of the packed micropanel. NOTE: the - // imaginary stride is mostly vestigial and left over from the 3m - // and 4m implementations. - is_p = 1; - - // Store the strides and panel dimension in P. - bli_obj_set_strides( rs_p, cs_p, p ); - bli_obj_set_imag_stride( is_p, p ); - bli_obj_set_panel_dim( n_panel, p ); - bli_obj_set_panel_stride( ps_p, p ); - bli_obj_set_panel_length( m_p, p ); - bli_obj_set_panel_width( n_panel, p ); - - // Compute the size of the packed buffer. - size_p = ps_p * ( n_p_pad / n_panel ) * elem_size_p; - } - else - { - // NOTE: When implementing block storage, we only need to implement - // the following two cases: - // - row-stored blocks in row-major order - // - column-stored blocks in column-major order - // The other two combinations coincide with that of packed row-panel - // and packed column- panel storage. - - size_p = 0; - } - - return size_p; + siz_t elem_size_p = bli_obj_elem_size( p ); + + // The panel dimension (for each datatype) should be equal to the + // default (logical) blocksize multiple in the m dimension. + dim_t m_panel = bmult_m_def; + + // The "column stride" of a row-micropanel packed object is interpreted + // as the column stride WITHIN a micropanel. Thus, this is equal to the + // packing (storage) blocksize multiple, which may be equal to the + // default (logical) blocksize multiple). + inc_t cs_p = bmult_m_pack; + + // The "row stride" of a row-micropanel packed object is interpreted + // as the row stride WITHIN a micropanel. Thus, it is unit. 
+ inc_t rs_p = 1; + + // The "panel stride" of a micropanel packed object is interpreted as + // the distance between the (0,0) element of panel k and the (0,0) + // element of panel k+1. We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each micropanel (ie: the right edge of the matrix). Zero-padding + // can also occur along the long edge of the last micropanel if the m + // dimension of the matrix is not a whole multiple of MR. + inc_t ps_p = cs_p * n_p_pad; + + // As a general rule, we don't want micropanel strides to be odd. There + // are very few instances where this can happen, but we've seen it happen + // more than zero times (such as for certain small problems), and so we + // check for it here. + if ( bli_is_odd( ps_p ) ) ps_p += 1; + + // Set the imaginary stride (in units of fundamental elements). + // This is the number of real elements that must be traversed before + // reaching the imaginary part of the packed micropanel. NOTE: the + // imaginary stride is mostly vestigial and left over from the 3m + // and 4m implementations. + inc_t is_p = 1; + + // Store the strides and panel dimension in P. + bli_obj_set_strides( rs_p, cs_p, p ); + bli_obj_set_imag_stride( is_p, p ); + bli_obj_set_panel_dim( m_panel, p ); + bli_obj_set_panel_stride( ps_p, p ); + bli_obj_set_panel_length( m_panel, p ); + bli_obj_set_panel_width( n_p, p ); + + // Compute the size of the packed buffer. + siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; + + // If the requested size is zero, then we don't need to do any allocation. + if ( size_p == 0 ) + return false; + + // Update the buffer address in p to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). 
+ void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); + bli_obj_set_buffer( buffer, p ); + + return true; } diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 9365a131e..152c6f15c 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -32,24 +32,13 @@ */ -siz_t bli_packm_init +BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS siz_t bli_packm_init_pack - ( - invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* a, - obj_t* p, - cntx_t* cntx + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 6dc9ec85a..c9a2bb9db 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -39,59 +39,19 @@ void bli_packm_int obj_t* a, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { bli_init_once(); - packm_var_oft f; + // Extract the function pointer from the object. + packm_var_oft f = bli_obj_pack_fn( a ); - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_packm_int_check( a, p, cntx ); - - // Sanity check; A should never have a zero dimension. If we must support - // it, then we should fold it into the next alias-and-early-exit block. - //if ( bli_obj_has_zero_dim( a ) ) bli_abort(); - - // Let us now check to see if the object has already been packed. First - // we check if it has been packed to an unspecified (row or column) - // format, in which case we can return, since by now aliasing has already - // taken place in packm_init(). 
- // NOTE: The reason we don't need to even look at the control tree in - // this case is as follows: an object's pack status is only set to - // BLIS_PACKED_UNSPEC for situations when the actual format used is - // not important, as long as its packed into contiguous rows or - // contiguous columns. A good example of this is packing for matrix - // operands in the level-2 operations. - if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC ) - { - return; - } - - // At this point, we can be assured that cntl is not NULL. Now we check - // if the object has already been packed to the desired schema (as en- - // coded in the control tree). If so, we can return, as above. - // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED - // and thus packing will be called for (but in some cases packing has - // already taken place, or does not need to take place, and so that will - // be indicated by the pack status). Also, not all combinations of - // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( a ) == bli_cntl_packm_params_pack_schema( cntl ) ) - { - return; - } - - // If the object is marked as being filled with zeros, then we can skip - // the packm operation entirely. - if ( bli_obj_is_zeros( a ) ) - { - return; - } - - // Extract the function pointer from the current control tree node. - f = bli_cntl_packm_params_var_func( cntl ); + // Barrier so that we know threads are done with previous computation + // with the same packing buffer before starting to pack. + bli_thread_barrier( thread ); // Invoke the variant with kappa_use. f @@ -99,8 +59,12 @@ void bli_packm_int a, p, cntx, + rntm, cntl, thread ); + + // Barrier so that packing is done before computation. 
+ bli_thread_barrier( thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 573a299d6..16a5c2c34 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -37,6 +37,7 @@ void bli_packm_int obj_t* a, obj_t* p, cntx_t* cntx, + rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/1m/packm/bli_packm_scalar.c similarity index 53% rename from frame/3/trsm/bli_trsm_packab.c rename to frame/1m/packm/bli_packm_scalar.c index 841230d80..f613028c9 100644 --- a/frame/3/trsm/bli_trsm_packab.c +++ b/frame/1m/packm/bli_packm_scalar.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,83 +35,42 @@ #include "blis.h" -void bli_trsm_packa - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) +void* bli_packm_scalar( obj_t* kappa, obj_t* p ) { - obj_t a_pack; + num_t dt_p = bli_obj_dt( p ); + pack_t schema = bli_obj_pack_schema( p ); - // Pack matrix A according to the control tree node. - bli_l3_packm - ( - a, - &a_pack, - cntx, - rntm, - cntl, - thread - ); + // The value for kappa we use will depends on whether the scalar + // attached to A has a nonzero imaginary component. If it does, + // then we will apply the scalar during packing to facilitate + // implementing induced complex domain algorithms in terms of + // real domain micro-kernels. (In the aforementioned situation, + // applying a real scalar is easy, but applying a complex one is + // harder, so we avoid the need altogether with the code below.) 
+ if ( bli_obj_scalar_has_nonzero_imag( p ) && + !bli_is_nat_packed( schema ) ) + { + //printf( "applying non-zero imag kappa\n_p" ); - // Proceed with execution using packed matrix A. - bli_trsm_int - ( - &BLIS_ONE, - &a_pack, - b, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); -} - -// ----------------------------------------------------------------------------- - -void bli_trsm_packb - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t b_pack; + // Detach the scalar. + bli_obj_scalar_detach( p, kappa ); - // Pack matrix B according to the control tree node. - bli_l3_packm - ( - b, - &b_pack, - cntx, - rntm, - cntl, - thread - ); + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); - // Proceed with execution using packed matrix B. - bli_trsm_int - ( - &BLIS_ONE, - a, - &b_pack, - &BLIS_ONE, - c, - cntx, - rntm, - bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) - ); + return bli_obj_buffer_for_1x1( dt_p, kappa ); + } + // This branch is also for native execution, where we assume that + // the micro-kernel will always apply the alpha scalar of the + // higher-level operation. Thus, we use BLIS_ONE for kappa so + // that the underlying packm implementation does not perform + // any scaling during packing. + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. 
+ return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); + } } diff --git a/frame/1m/packm/bli_packm_md.h b/frame/1m/packm/bli_packm_scalar.h similarity index 95% rename from frame/1m/packm/bli_packm_md.h rename to frame/1m/packm/bli_packm_scalar.h index bb9d6d613..3745accf9 100644 --- a/frame/1m/packm/bli_packm_md.h +++ b/frame/1m/packm/bli_packm_scalar.h @@ -32,6 +32,5 @@ */ -#include "bli_packm_blk_var1_md.h" -#include "bli_packm_struc_cxk_md.h" +BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index a3b2d66e6..2a52c42de 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -40,57 +40,24 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. */ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. 
*/ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -118,23 +85,21 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk) \ ( \ strucc, \ - diagoffc, \ + diagc, \ uploc, \ conjc, \ schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ + invdiag, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ cntx \ ); \ } \ @@ -145,130 +110,24 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ schema, \ invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ cntx \ ); \ } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. - This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. 
*/ \ -\ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t i = m_panel; \ - dim_t m_edge = m_panel_max - i; \ - dim_t n_edge = n_panel_max; \ - ctype* p_edge = p + (i )*rs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t j = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - j; \ - ctype* p_edge = p + (j )*cs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -*/ \ -\ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t i = m_panel; \ - dim_t j = n_panel; \ - dim_t m_br = m_panel_max - i; \ - dim_t n_br = n_panel_max - j; \ - ctype* p_br = p + (i )*rs_p + (j )*cs_p; \ -\ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one, \ - p_br, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ -\ -\ -/* - if ( bli_is_col_packed( schema ) ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ - else if ( bli_is_row_packed( schema ) ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \ - p, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ } INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) @@ -282,42 +141,31 @@ INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t i, j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t i, j; \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -325,10 +173,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -350,7 +198,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype* restrict c10; \ ctype* restrict p10; \ @@ -370,14 +218,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -393,8 +239,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -402,16 +248,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -486,8 +331,8 @@ void PASTEMAC(ch,varname) \ transc, \ p11_m, \ p11_n, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p, \ + c11, incc, ldc, \ + p11, 1, ldp, \ cntx, \ NULL \ ); \ @@ -503,7 +348,7 @@ void PASTEMAC(ch,varname) \ { \ PASTEMAC(ch,seti0s)( *pi11 ); \ \ - pi11 += rs_p + cs_p; \ + pi11 += 1 + ldp; \ } \ } \ \ @@ -519,7 +364,7 @@ void PASTEMAC(ch,varname) \ p11_m, \ p11_n, \ kappa, \ - p11, rs_p, cs_p, \ + p11, 1, ldp, \ cntx, \ NULL \ ); \ @@ -539,28 +384,26 @@ INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - 
dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ) \ { \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ +\ /* Pack the panel. */ \ PASTEMAC(ch,kername) \ ( \ @@ -584,11 +427,11 @@ void PASTEMAC(ch,varname) \ PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ - m_panel, \ - n_panel, \ + diagoffc, \ + panel_dim, \ + panel_len, \ kappa, \ - p, rs_p, cs_p, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ @@ -599,10 +442,10 @@ void PASTEMAC(ch,varname) \ { \ PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \ ( \ - diagoffp, \ - m_panel, \ - n_panel, \ - p, rs_p, cs_p, \ + diagoffc, \ + panel_dim, \ + panel_len, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ @@ -621,23 +464,53 @@ void PASTEMAC(ch,varname) \ uplo_t uplop = uploc; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ \ PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - diagoffp, \ + diagoffc, \ BLIS_NONUNIT_DIAG, \ uplop, \ - m_panel, \ - n_panel, \ + panel_dim, \ + panel_len, \ zero, \ - p, rs_p, cs_p, \ + p, 1, ldp, \ cntx, \ NULL \ ); \ } \ \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. 
Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t i = panel_dim; \ + dim_t j = panel_len; \ + dim_t m_br = panel_dim_max - i; \ + dim_t n_br = panel_len_max - j; \ + ctype* p_br = p + (i ) + (j )*ldp; \ +\ + PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + m_br, \ + n_br, \ + one, \ + p_br, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ } INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk ) diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h index 08afb19bd..973a02612 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk.h @@ -38,84 +38,25 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_struc_cxk ) - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_herm_cxk ) - - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - 
struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ dim_t panel_dim_max, \ - dim_t panel_len, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ cntx_t* cntx \ ); +INSERT_GENTPROT_BASIC0( packm_struc_cxk ) +INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c index a66ba5ff6..b3be9dff9 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.c +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -40,57 +40,25 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. 
*/ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ /* Handle micro-panel packing based on the structure of the matrix being packed. */ \ if ( bli_is_general( strucc ) ) \ @@ -108,7 +76,7 @@ void PASTEMAC(ch,varname) \ kappa, \ c, incc, ldc, \ p, ldp, \ - cntx \ + cntx \ ); \ } \ else if ( bli_is_herm_or_symm( strucc ) ) \ @@ -118,24 +86,23 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_herm_cxk_1er) \ ( \ strucc, \ - diagoffc, \ + diagc, \ uploc, \ conjc, \ schema, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ + invdiag, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx, \ + params \ ); \ } \ else /* ( bli_is_triangular( strucc ) ) */ \ @@ -145,125 +112,25 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,packm_tri_cxk_1er) \ ( \ strucc, \ - diagoffc, \ diagc, \ uploc, \ conjc, \ schema, \ invdiag, \ - m_panel, \ - n_panel, \ - m_panel_max, \ - n_panel_max, \ panel_dim, \ - panel_dim_max, \ panel_len, \ + panel_dim_max, \ panel_len_max, \ + panel_dim_off, \ + panel_len_off, \ kappa, \ - c, rs_c, cs_c, \ - incc, ldc, \ - p, rs_p, cs_p, \ - ldp, \ - cntx \ - ); \ - } \ -\ -\ - /* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally - fill the edge region (the bottom m_panel_max - m_panel rows or right- - side n_panel_max - n_panel columns) of the micropanel with zeros. - However, this responsibility has been moved to the packm microkernel. 
- This change allows experts to use custom kernels that pack to custom - packing formats when the problem size is not a nice multiple of the - register blocksize. */ \ -/* - if ( m_panel != m_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t offm = m_panel; \ - dim_t offn = 0; \ - dim_t m_edge = m_panel_max - m_panel; \ - dim_t n_edge = n_panel_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, rs_p, cs_p, ldp \ - ); \ - } \ -\ - if ( n_panel != n_panel_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - dim_t offm = 0; \ - dim_t offn = n_panel; \ - dim_t m_edge = m_panel_max; \ - dim_t n_edge = n_panel_max - n_panel; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, rs_p, cs_p, ldp \ + c, incc, ldc, \ + p, ldp, \ + is_p, \ + cntx, \ + params \ ); \ } \ -*/ \ -\ - if ( bli_is_triangular( strucc ) ) \ - { \ - /* If this micro-panel is an edge case in both panel dimension and - length, then it must be a bottom-right corner case, which - typically only happens for micro-panels being packed for trsm. - (It also happens for trmm if kr > 1.) Here, we set the part of - the diagonal that extends into the zero-padded region to - identity. This prevents NaNs and Infs from creeping into the - computation. If this code does execute for trmm, it is okay, - because those 1.0's that extend into the bottom-right region - end up getting muliplied by the 0.0's in the zero-padded region - of the other matrix. 
*/ \ - if ( m_panel != m_panel_max && \ - n_panel != n_panel_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t offm = m_panel; \ - dim_t offn = n_panel; \ - dim_t m_edge = m_panel_max - m_panel; \ - dim_t n_edge = n_panel_max - n_panel; \ -\ - PASTEMAC(ch,set1ms_mxn_diag) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - one, \ - p, rs_p, cs_p, ldp \ - ); \ - } \ - } \ -\ -\ -/* - if ( bli_is_1r_packed( schema ) ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ - } \ - \ - if ( bli_is_1e_packed( schema ) ) { \ - PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \ - ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ - } \ -*/ \ } INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) @@ -277,42 +144,32 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffc, \ + diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffc_abs; \ - dim_t j; \ - bool row_stored; \ - bool col_stored; \ -\ -\ - /* Create flags to incidate row or column storage. Note that the - schema bit that encodes row or column is describing the form of - micro-panel, not the storage in the micro-panel. Hence the - mismatch in "row" and "column" semantics. 
*/ \ - row_stored = bli_is_col_packed( schema ); \ - col_stored = bli_is_row_packed( schema ); \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs; \ + dim_t j; \ \ /* Handle the case where the micro-panel does NOT intersect the diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ { \ /* If the current panel is unstored, we need to make a few adjustments so we refer to the data where it is actually @@ -320,10 +177,10 @@ void PASTEMAC(ch,varname) \ implicitly assumes we are operating on a dense panel within a larger symmetric or Hermitian matrix, since a general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ { \ - c = c + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ + c = c + diagoffc * ( doff_t )ldc + \ + -diagoffc * ( doff_t )incc; \ bli_swap_incs( &incc, &ldc ); \ \ if ( bli_is_hermitian( strucc ) ) \ @@ -345,7 +202,7 @@ void PASTEMAC(ch,varname) \ cntx \ ); \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ { \ ctype* restrict c10; \ ctype* restrict p10; \ @@ -366,14 +223,12 @@ void PASTEMAC(ch,varname) \ a micro-panel. If they do, then somehow the constraints on cache blocksizes being a whole multiple of the register blocksizes was somehow violated. 
*/ \ - if ( ( col_stored && diagoffc < 0 ) || \ - ( row_stored && diagoffc > 0 ) ) \ + if ( diagoffc < 0 ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ diagoffc_abs = bli_abs( diagoffc ); \ \ - if ( ( row_stored && bli_is_upper( uploc ) ) || \ - ( col_stored && bli_is_lower( uploc ) ) ) \ + if ( bli_is_lower( uploc ) ) \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs; \ @@ -389,8 +244,8 @@ void PASTEMAC(ch,varname) \ diagoffc12 = diagoffc_abs - j; \ p12 = p + (j )*ldp; \ c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ - -diagoffc12 * ( doff_t )rs_c; \ + c12 = c12 + diagoffc12 * ( doff_t )ldc + \ + -diagoffc12 * ( doff_t )incc; \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ @@ -398,16 +253,15 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ } \ - else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ - ( col_stored && bli_is_upper( uploc ) ) ) */ \ + else /* if ( bli_is_upper( uploc ) ) */ \ { \ p10_dim = panel_dim; \ p10_len = diagoffc_abs + panel_dim; \ diagoffc10 = diagoffc; \ p10 = p; \ c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ - -diagoffc10 * ( doff_t )rs_c; \ + c10 = c10 + diagoffc10 * ( doff_t )ldc + \ + -diagoffc10 * ( doff_t )incc; \ incc10 = ldc; \ ldc10 = incc; \ conjc10 = conjc; \ @@ -478,8 +332,8 @@ void PASTEMAC(ch,varname) \ conjc, \ panel_dim, \ kappa, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p, ldp \ + c11, incc, ldc, \ + p11, 1, ldp, ldp \ ); \ \ /* If we are packing a micro-panel with Hermitian structure, @@ -495,8 +349,8 @@ void PASTEMAC(ch,varname) \ if ( bli_is_hermitian( strucc ) ) \ { \ ctype_r* restrict c11_r = ( ctype_r* )c11; \ - const dim_t rs_c2 = 2*rs_c; \ - const dim_t cs_c2 = 2*cs_c; \ + const dim_t incc2 = 2*incc; \ + const dim_t ldc2 = 2*ldc; \ \ PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \ ( \ @@ -504,8 +358,8 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - c11_r, rs_c2, cs_c2, \ - p11, rs_p, cs_p, ldp \ + c11_r, 
incc2, ldc2, \ + p11, 1, ldp, ldp \ ); \ } \ } \ @@ -523,30 +377,28 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er ) void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ + dim_t panel_dim_max, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx, \ + void* params \ ) \ { \ - doff_t diagoffp_abs = bli_abs( diagoffp ); \ - ctype* p11 = p + (diagoffp_abs )*ldp; \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + doff_t diagoffc_abs = bli_abs( diagoffc ); \ + ctype* p11 = p + (diagoffc_abs )*ldp; \ \ \ /* Pack the panel. */ \ @@ -579,7 +431,7 @@ void PASTEMAC(ch,varname) \ panel_dim, \ panel_dim, \ kappa, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ \ @@ -594,7 +446,7 @@ void PASTEMAC(ch,varname) \ 0, \ panel_dim, \ panel_dim, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ \ @@ -610,11 +462,11 @@ void PASTEMAC(ch,varname) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ uplo_t uplop = uploc; \ - doff_t diagoffp11_0 = 0; \ + doff_t diagoffc11_0 = 0; \ dim_t p11_0_dim = panel_dim - 1; \ \ bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp11_0 ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \ \ /* Note that this macro works a little differently than the setm operation. 
Here, we pass in the dimensions of only p11, rather @@ -622,20 +474,51 @@ void PASTEMAC(ch,varname) \ "shrunken" dimensions of p11, corresponding to the toggling and shrinking of the diagonal above. The macro will do the right thing, incrementing the pointer to p11 by the appropriate - leading dimension (cs_p or rs_p), and setting only the lower + leading dimension (ldp or rs_p), and setting only the lower or upper triangle to zero. */ \ PASTEMAC(ch,set1ms_mxn_uplo) \ ( \ schema, \ - diagoffp11_0, \ + diagoffc11_0, \ uplop, \ p11_0_dim, \ p11_0_dim, \ zero, \ - p11, rs_p, cs_p, ldp \ + p11, 1, ldp, ldp \ ); \ } \ } \ +\ + /* If this micro-panel is an edge case in both panel dimension and + length, then it must be a bottom-right corner case, which + typically only happens for micro-panels being packed for trsm. + (It also happens for trmm if kr > 1.) Here, we set the part of + the diagonal that extends into the zero-padded region to + identity. This prevents NaNs and Infs from creeping into the + computation. If this code does execute for trmm, it is okay, + because those 1.0's that extend into the bottom-right region + end up getting muliplied by the 0.0's in the zero-padded region + of the other matrix. 
*/ \ + if ( panel_dim != panel_dim_max && \ + panel_len != panel_len_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t offm = panel_dim; \ + dim_t offn = panel_len; \ + dim_t m_edge = panel_dim_max - panel_dim; \ + dim_t n_edge = panel_len_max - panel_len; \ +\ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + one, \ + p, 1, ldp, ldp \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h index 6e62d8f69..a953e9367 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_1er.h +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h @@ -38,84 +38,26 @@ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ - doff_t diagoffp, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ dim_t panel_dim, \ - dim_t panel_dim_max, \ dim_t panel_len, \ - dim_t panel_len_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) - - - -#undef GENTPROTCO -#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t 
conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - dim_t panel_dim, \ dim_t panel_dim_max, \ - dim_t panel_len, \ dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t ldp, \ - cntx_t* cntx \ + ctype* restrict c, inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx, \ + void* params \ ); +INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) +INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c index 52a1f9817..650b6178c 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.c +++ b/frame/1m/packm/bli_packm_struc_cxk_md.c @@ -41,53 +41,26 @@ \ void PASTEMAC2(chc,chp,varname) \ ( \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype_p* restrict kappa, \ - ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype_c* restrict c, inc_t incc, inc_t ldc, \ + ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ) \ { \ - dim_t panel_dim; \ - dim_t panel_dim_max; \ - dim_t panel_len; \ - dim_t panel_len_max; \ - inc_t incc, ldc; \ - inc_t ldp; \ -\ -\ - /* Determine the dimensions and relative strides of the micro-panel - based on its pack schema. */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - /* Prepare to pack to row-stored column panel. 
*/ \ - panel_dim = n_panel; \ - panel_dim_max = n_panel_max; \ - panel_len = m_panel; \ - panel_len_max = m_panel_max; \ - incc = cs_c; \ - ldc = rs_c; \ - ldp = rs_p; \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - /* Prepare to pack to column-stored row panel. */ \ - panel_dim = m_panel; \ - panel_dim_max = m_panel_max; \ - panel_len = n_panel; \ - panel_len_max = n_panel_max; \ - incc = rs_c; \ - ldc = cs_c; \ - ldp = cs_p; \ - } \ -\ -\ if ( bli_is_nat_packed( schema ) ) \ { \ /* Sanity check: Make sure that kappa is 1.0. Mixed-datatype alpha @@ -318,7 +291,7 @@ void PASTEMAC2(cha,chp,opname) \ conj_t conja, \ dim_t m, \ dim_t n, \ - ctype_p* restrict kappa, \ + ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ) \ @@ -445,7 +418,7 @@ void PASTEMAC2(cha,chp,opname) \ conj_t conja, \ dim_t m, \ dim_t n, \ - ctype_p* restrict kappa, \ + ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ) \ diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.h b/frame/1m/packm/bli_packm_struc_cxk_md.h index 72ca67937..f493838b3 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.h +++ b/frame/1m/packm/bli_packm_struc_cxk_md.h @@ -37,17 +37,24 @@ \ void PASTEMAC2(chc,chp,varname) \ ( \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ ctype_p* restrict kappa, \ - ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype_c* restrict c, inc_t incc, inc_t ldc, \ + ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ - cntx_t* cntx \ + cntx_t* cntx, \ + void* params \ ); INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) diff --git 
a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c deleted file mode 100644 index 6e72b3e9d..000000000 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#define FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - dim_t m, - dim_t n, - dim_t m_max, - dim_t n_max, - void* kappa, - void* c, inc_t rs_c, inc_t cs_c, - void* p, inc_t rs_p, inc_t cs_p, - cntx_t* cntx - ); - -static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); - - -void bli_packm_unb_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_cp = bli_obj_dt( c ); - - struc_t strucc = bli_obj_struc( c ); - doff_t diagoffc = bli_obj_diag_offset( c ); - diag_t diagc = bli_obj_diag( c ); - uplo_t uploc = bli_obj_uplo( c ); - trans_t transc = bli_obj_conjtrans_status( c ); - - dim_t m_p = bli_obj_length( p ); - dim_t n_p = bli_obj_width( p ); - dim_t m_max_p = bli_obj_padded_length( p ); - dim_t n_max_p = bli_obj_padded_width( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - - void* buf_kappa; - - FUNCPTR_T f; - - - // This variant assumes that the computational kernel will always apply - // the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE - // for kappa so that the underlying packm implementation does not scale - // during packing. - buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_cp]; - - if( bli_thread_am_ochief( thread ) ) { - // Invoke the function. 
- f - ( - strucc, - diagoffc, - diagc, - uploc, - transc, - m_p, - n_p, - m_max_p, - n_max_p, - buf_kappa, - buf_c, rs_c, cs_c, - buf_p, rs_p, cs_p, - cntx - ); - } -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict zero = PASTEMAC(ch,0); \ -\ - /* We begin by packing the region indicated by the parameters. If - matrix c is dense (either because the structure is general or - because the structure has already been "densified"), this ends - up being the only action we take. Note that if kappa is unit, - the data is simply copied (rather than scaled by one). */ \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - diagc, \ - uploc, \ - transc, \ - m, \ - n, \ - kappa_cast, \ - c_cast, rs_c, cs_c, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ -\ - /* If uploc is upper or lower, then the structure of c is necessarily - non-dense (ie: Hermitian, symmetric, or triangular, where part of the - matrix is unstored). In these cases, we want to fill in the unstored - part of the matrix. How this is done depends on the structure of c. */ \ - if ( bli_is_upper_or_lower( uploc ) ) \ - { \ - /* The Hermitian and symmetric cases are almost identical, so we - handle them in one conditional block. */ \ - if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \ - { \ - /* First we must reflect the region referenced to the opposite - side of the diagonal. 
*/ \ - c_cast = c_cast + diagoffc * ( doff_t )cs_c + \ - -diagoffc * ( doff_t )rs_c; \ - bli_negate_diag_offset( &diagoffc ); \ - bli_toggle_trans( &transc ); \ - if ( bli_is_upper( uploc ) ) diagoffc += 1; \ - else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \ -\ - /* If c is Hermitian, we need to apply a conjugation when - copying the region opposite the diagonal. */ \ - if ( bli_is_hermitian( strucc ) ) \ - transc = bli_trans_toggled_conj( transc ); \ -\ - /* Copy the data from the region opposite the diagonal of c - (as specified by the original value of diagoffc). Notice - that we use a diag parameter of non-unit since we can - assume nothing about the neighboring off-diagonal. */ \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - transc, \ - m, \ - n, \ - kappa_cast, \ - c_cast, rs_c, cs_c, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - else /* if ( bli_is_triangular( strucc ) ) */ \ - { \ - doff_t diagoffp = diagoffc; \ - uplo_t uplop = uploc; \ -\ - /* For this step we need the uplo and diagonal offset of p, which - we can derive from the parameters given. */ \ - if ( bli_does_trans( transc ) ) \ - { \ - bli_negate_diag_offset( &diagoffp ); \ - bli_toggle_uplo( &uplop ); \ - } \ -\ - /* For triangular matrices, we wish to reference the region - strictly opposite the diagonal of C. This amounts to - toggling uploc and then shifting the diagonal offset to - shrink the stored region (by one diagonal). */ \ - bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \ -\ - /* Set the region opposite the diagonal of p to zero. 
*/ \ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffp, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - m, \ - n, \ - zero, \ - p_cast, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ - } \ -\ - /* The packed memory region was acquired/allocated with "aligned" - dimensions (ie: dimensions that were possibly inflated up to a - multiple). When these dimension are inflated, it creates empty - regions along the bottom and/or right edges of the matrix. If - eithe region exists, we set them to zero. This simplifies the - register level micro kernel in that it does not need to support - different register blockings for the edge cases. */ \ - if ( m != m_max ) \ - { \ - ctype* p_edge = p_cast + (m )*rs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_max - m, \ - n_max, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -\ - if ( n != n_max ) \ - { \ - ctype* p_edge = p_cast + (n )*cs_p; \ -\ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_max, \ - n_max - n, \ - zero, \ - p_edge, rs_p, cs_p, \ - cntx, \ - NULL \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_unb_var1 ) - diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h deleted file mode 100644 index 8960c8661..000000000 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_packm_unb_var1 - ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - void* kappa, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( packm_unb_var1 ) - diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index b32d02d9b..5e4542841 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -36,8 +36,6 @@ #include "bli_unpackm_check.h" #include "bli_unpackm_int.h" -#include "bli_unpackm_unb_var1.h" - #include "bli_unpackm_blk_var1.h" #include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.c b/frame/1m/unpackm/bli_unpackm_unb_var1.c deleted file mode 100644 index c1033c2cb..000000000 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define FUNCPTR_T unpackm_fp - -typedef void (*FUNCPTR_T)( - doff_t diagoffp, - uplo_t uplop, - trans_t transp, - dim_t m, - dim_t n, - void* p, inc_t rs_p, inc_t cs_p, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ); - -static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1); - - -void bli_unpackm_unb_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_pc = bli_obj_dt( p ); - - doff_t diagoffp = bli_obj_diag_offset( p ); - uplo_t uplop = bli_obj_uplo( p ); - trans_t transc = bli_obj_onlytrans_status( c ); - - dim_t m_c = bli_obj_length( c ); - dim_t n_c = bli_obj_width( c ); - - void* buf_p = bli_obj_buffer_at_off( p ); - inc_t rs_p = bli_obj_row_stride( p ); - inc_t cs_p = bli_obj_col_stride( p ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_pc]; - - // Invoke the function. 
- f( diagoffp, - uplop, - transc, - m_c, - n_c, - buf_p, rs_p, cs_p, - buf_c, rs_c, cs_c, - cntx - ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffp, \ - uplo_t uplop, \ - trans_t transp, \ - dim_t m, \ - dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ) \ -{ \ - ctype* p_cast = p; \ - ctype* c_cast = c; \ -\ - PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffp,\ - BLIS_NONUNIT_DIAG, \ - uplop, \ - transp, \ - m, \ - n, \ - p_cast, rs_p, cs_p, \ - c_cast, rs_c, cs_c, \ - cntx, \ - NULL \ - ); \ -} - -INSERT_GENTFUNC_BASIC( unpackm, unpackm_unb_var1 ) - diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.h b/frame/1m/unpackm/bli_unpackm_unb_var1.h deleted file mode 100644 index 5119aaa7f..000000000 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_unpackm_unb_var1 - ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread - ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffp, \ - uplo_t uplop, \ - trans_t transp, \ - dim_t m, \ - dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC0( unpackm_unb_var1 ) - diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index da9348844..4dc1a9d54 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -35,6 +35,8 @@ #include "bli_l3_cntl.h" #include "bli_l3_check.h" +#include "bli_l3_int.h" +#include "bli_l3_packab.h" // Define function types. //#include "bli_l3_ft_ex.h" @@ -45,7 +47,6 @@ #include "bli_l3_blocksize.h" #include "bli_l3_direct.h" #include "bli_l3_prune.h" -#include "bli_l3_packm.h" #include "bli_l3_schema.h" // Prototype object APIs (basic and expert). diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 50da4627c..3e7882bc3 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -53,7 +53,7 @@ void bli_gemm_check // Check object structure. 
// NOTE: Can't perform these checks as long as bli_gemm_check() is called - // from bli_gemm_int(), which is in the execution path for structured + // from bli_l3_int(), which is in the execution path for structured // level-3 operations such as hemm. //e_val = bli_check_general_object( a ); @@ -109,7 +109,7 @@ void bli_hemm_check } void bli_herk_check - ( + ( obj_t* alpha, obj_t* a, obj_t* beta, @@ -197,7 +197,7 @@ void bli_symm_check } void bli_syrk_check - ( + ( obj_t* alpha, obj_t* a, obj_t* beta, diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/bli_l3_int.c similarity index 74% rename from frame/3/trsm/bli_trsm_int.c rename to frame/3/bli_l3_int.c index 53a22c355..d4b974030 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/bli_l3_int.c @@ -34,7 +34,7 @@ #include "blis.h" -void bli_trsm_int +void bli_l3_int ( obj_t* alpha, obj_t* a, @@ -47,10 +47,9 @@ void bli_trsm_int thrinfo_t* thread ) { - obj_t a_local; - obj_t b_local; - obj_t c_local; - trsm_var_oft f; + obj_t a_local; + obj_t b_local; + obj_t c_local; // Return early if the current control tree node is NULL. if ( bli_cntl_is_null( cntl ) ) return; @@ -60,72 +59,82 @@ void bli_trsm_int bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); // If C has a zero dimension, return early. - if ( bli_obj_has_zero_dim( c ) ) return; + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { if ( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); + bli_scalm( beta, c ); bli_thread_barrier( thread ); return; } - // Alias A and B in case we need to update attached scalars. + // If A or B is marked as being filled with zeros, scale C by beta and + // return early. + if ( bli_obj_is_zeros( a ) || + bli_obj_is_zeros( b ) ) + { + // This should never execute. 
+ bli_abort(); + + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_barrier( thread ); + return; + } + + // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); - - // Alias C in case we need to induce a transposition. bli_obj_alias_to( c, &c_local ); + // Ensure that a valid packing function is set on A and B. + if ( !bli_obj_pack_fn( &a_local ) ) + bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local ); + + if ( !bli_obj_pack_fn( &b_local ) ) + bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local ); + // If we are about to call a leaf-level implementation, and matrix C // still needs a transposition, then we must induce one by swapping the // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. - if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) + //if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) ) + if ( bli_obj_has_trans( c ) ) { bli_obj_induce_trans( &c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local ); } - // If beta is non-unit, apply it to the scalar attached to C. - if ( !bli_obj_equals( beta, &BLIS_ONE ) ) + // If alpha is non-unit, typecast and apply it to the scalar attached + // to B, unless it happens to be triangular. + if ( bli_obj_root_is_triangular( b ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); - } - - // Set two bools: one based on the implied side parameter (the structure - // of the root object) and one based on the uplo field of the triangular - // matrix's root object (whether that is matrix A or matrix B). - if ( bli_obj_root_is_triangular( a ) ) - { - // If alpha is non-unit, typecast and apply it to the scalar - // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &b_local ); - } + bli_obj_scalar_apply_scalar( alpha, &a_local ); } else // if ( bli_obj_root_is_triangular( b ) ) { - // If alpha is non-unit, typecast and apply it to the scalar - // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &a_local ); - } + bli_obj_scalar_apply_scalar( alpha, &b_local ); } - // FGVZ->TMS: Is this barrier still needed? - bli_thread_barrier( thread ); + // If beta is non-unit, typecast and apply it to the scalar attached + // to C. + if ( !bli_obj_equals( beta, &BLIS_ONE ) ) + bli_obj_scalar_apply_scalar( beta, &c_local ); // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( rntm, cntl, thread ); // Extract the function pointer from the current control tree node. - f = bli_cntl_var_func( cntl ); + l3_var_oft f = bli_cntl_var_func( cntl ); // Invoke the variant. f diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/bli_l3_int.h similarity index 99% rename from frame/3/gemm/bli_gemm_int.h rename to frame/3/bli_l3_int.h index 2bbe5480a..d76b0ac3e 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/bli_l3_int.h @@ -32,7 +32,7 @@ */ -void bli_gemm_int +void bli_l3_int ( obj_t* alpha, obj_t* a, diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index 1456f8eff..ea10d8090 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -54,24 +54,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ thrinfo_t* thread \ ); -GENTDEF( gemm ) - - -#undef GENTDEF -#define GENTDEF( opname ) \ -\ -typedef void (*PASTECH(opname,_var_oft)) \ -( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ -); - -GENTDEF( trsm ) +GENTDEF( l3 ) diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/bli_l3_packab.c similarity index 80% rename from frame/3/gemm/bli_gemm_packab.c rename to 
frame/3/bli_l3_packab.c index a15192994..d91181942 100644 --- a/frame/3/gemm/bli_gemm_packab.c +++ b/frame/3/bli_l3_packab.c @@ -34,7 +34,7 @@ #include "blis.h" -void bli_gemm_packa +void bli_l3_packa ( obj_t* a, obj_t* b, @@ -45,12 +45,19 @@ void bli_gemm_packa thrinfo_t* thread ) { - obj_t a_pack; + obj_t a_local, a_pack; + + bli_obj_alias_to( a, &a_local ); + if ( bli_obj_has_trans( a ) ) + { + bli_obj_induce_trans( &a_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); + } // Pack matrix A according to the control tree node. - bli_l3_packm + bli_packm_int ( - a, + &a_local, &a_pack, cntx, rntm, @@ -59,7 +66,7 @@ void bli_gemm_packa ); // Proceed with execution using packed matrix A. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a_pack, @@ -75,7 +82,7 @@ void bli_gemm_packa // ----------------------------------------------------------------------------- -void bli_gemm_packb +void bli_l3_packb ( obj_t* a, obj_t* b, @@ -86,25 +93,39 @@ void bli_gemm_packb thrinfo_t* thread ) { - obj_t b_pack; + obj_t bt_local, bt_pack; + + // We always pass B^T to bli_l3_packm. + bli_obj_alias_to( b, &bt_local ); + if ( bli_obj_has_trans( b ) ) + { + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); + } + else + { + bli_obj_induce_trans( &bt_local ); + } // Pack matrix B according to the control tree node. - bli_l3_packm + bli_packm_int ( - b, - &b_pack, + &bt_local, + &bt_pack, cntx, rntm, cntl, thread ); + // Transpose packed object back to B. + bli_obj_induce_trans( &bt_pack ); + // Proceed with execution using packed matrix B. 
- bli_gemm_int + bli_l3_int ( &BLIS_ONE, a, - &b_pack, + &bt_pack, &BLIS_ONE, c, cntx, diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/bli_l3_packab.h similarity index 90% rename from frame/3/trsm/bli_trsm_int.h rename to frame/3/bli_l3_packab.h index aabb2a8aa..380ca7212 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/bli_l3_packab.h @@ -32,12 +32,21 @@ */ -void bli_trsm_int +void bli_l3_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); + +void bli_l3_packb ( - obj_t* alpha, obj_t* a, obj_t* b, - obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c deleted file mode 100644 index 48f55c360..000000000 --- a/frame/3/bli_l3_packm.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_l3_packm - ( - obj_t* x, - obj_t* x_pack, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - packbuf_t pack_buf_type; - mem_t* cntl_mem_p; - siz_t size_needed; - - // FGVZ: Not sure why we need this barrier, but we do. - bli_thread_barrier( thread ); - - // Every thread initializes x_pack and determines the size of memory - // block needed (which gets embedded into the otherwise "blank" mem_t - // entry in the control tree node). - size_needed - = - bli_packm_init - ( - x, - x_pack, - cntx, - cntl - ); - - // If zero was returned, no memory needs to be allocated and so we can - // return early. - if ( size_needed == 0 ) return; - - // Query the pack buffer type from the control tree node. - pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); - - // Query the address of the mem_t entry within the control tree node. - cntl_mem_p = bli_cntl_pack_mem( cntl ); - - // Check the mem_t field in the control tree. If it is unallocated, then - // we need to acquire a block from the memory broker and broadcast it to - // all threads in the chief's thread group. 
- if ( bli_mem_is_unalloc( cntl_mem_p ) ) - { - mem_t* local_mem_p; - mem_t local_mem_s; - - if ( bli_thread_am_ochief( thread ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_packm(): acquiring mem pool block\n" ); - #endif - - // The chief thread acquires a block from the memory broker - // and saves the associated mem_t entry to local_mem_s. - bli_pba_acquire_m - ( - rntm, - size_needed, - pack_buf_type, - &local_mem_s - ); - } - - // Broadcast the address of the chief thread's local mem_t entry to - // all threads. - local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); - - // Save the contents of the chief thread's local mem_t entry to the - // mem_t field in this thread's control tree node. - *cntl_mem_p = *local_mem_p; - } - else // ( bli_mem_is_alloc( cntl_mem_p ) ) - { - mem_t* local_mem_p; - mem_t local_mem_s; - - // If the mem_t entry in the control tree does NOT contain a NULL - // buffer, then a block has already been acquired from the memory - // broker and cached in the control tree. - - // As a sanity check, we should make sure that the mem_t object isn't - // associated with a block that is too small compared to the size of - // the packed matrix buffer that is needed, according to the return - // value from packm_init(). - siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); - - if ( cntl_mem_size < size_needed ) - { - if ( bli_thread_am_ochief( thread ) ) - { - // The chief thread releases the existing block associated with - // the mem_t entry in the control tree, and then re-acquires a - // new block, saving the associated mem_t entry to local_mem_s. - bli_pba_release - ( - rntm, - cntl_mem_p - ); - bli_pba_acquire_m - ( - rntm, - size_needed, - pack_buf_type, - &local_mem_s - ); - } - - // Broadcast the address of the chief thread's local mem_t entry to - // all threads. 
- local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); - - // Save the chief thread's local mem_t entry to the mem_t field in - // this thread's control tree node. - *cntl_mem_p = *local_mem_p; - } - else - { - // If the mem_t entry is already allocated and sufficiently large, - // then we use it as-is. No action is needed, because all threads - // will already have the cached values in their local control - // trees' mem_t entries, currently pointed to by cntl_mem_p. - - bli_thread_barrier( thread ); - } - } - - - // Update the buffer address in x_pack to point to the buffer associated - // with the mem_t entry acquired from the memory broker (now cached in - // the control tree node). - void* buf = bli_mem_buffer( cntl_mem_p ); - bli_obj_set_buffer( buf, x_pack ); - - - // Pack the contents of object x to object x_pack. - bli_packm_int - ( - x, - x_pack, - cntx, - cntl, - thread - ); - - // Barrier so that packing is done before computation. - bli_thread_barrier( thread ); -} - diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index a6f8b4e1e..ddd88e163 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -34,7 +34,6 @@ #include "bli_gemm_cntl.h" #include "bli_gemm_front.h" -#include "bli_gemm_int.h" #include "bli_gemm_var.h" diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 3b7634338..de077e5ad 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -77,7 +77,7 @@ void bli_gemm_blk_var1 i, b_alg, c, &c1 ); // Perform gemm subproblem. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index d89a71053..53943e47c 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -77,7 +77,7 @@ void bli_gemm_blk_var2 i, b_alg, c, &c1 ); // Perform gemm subproblem. 
- bli_gemm_int + bli_l3_int ( &BLIS_ONE, a, diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 7883dfd6d..28029777d 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -71,7 +71,7 @@ void bli_gemm_blk_var3 i, b_alg, b, &b1 ); // Perform gemm subproblem. - bli_gemm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 27678e0bf..72d78efe1 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -57,8 +57,6 @@ cntl_t* bli_gemmbp_cntl_create ) { void_fp macro_kernel_fp; - void_fp packa_fp; - void_fp packb_fp; // Use the function pointers to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. @@ -67,9 +65,6 @@ cntl_t* bli_gemmbp_cntl_create else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; - packa_fp = bli_packm_blk_var1; - packb_fp = bli_packm_blk_var1; - // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( @@ -93,8 +88,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_gemm_packa, // pack the left-hand operand - packa_fp, + bli_l3_packa, // pack the left-hand operand BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal @@ -119,10 +113,9 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_gemm_packb, // pack the right-hand operand - packb_fp, - BLIS_KR, + bli_l3_packb, // pack the right-hand operand BLIS_NR, + BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? @@ -194,8 +187,8 @@ cntl_t* bli_gemmpb_cntl_create ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, - BLIS_KR, BLIS_MR, + BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? 
FALSE, // reverse iteration if lower? diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 792d69af5..a9ea21dc4 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -87,13 +87,14 @@ void bli_gemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); -#ifdef BLIS_ENABLE_GEMM_MD - // Don't perform the following optimization for ccr or crc cases, as - // those cases are sensitive to the ukernel storage preference (ie: - // transposing the operation would break them). - if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) && - !bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) ) -#endif + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the @@ -251,7 +252,7 @@ void bli_gemm_front // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c deleted file mode 100644 index 208e9bdca..000000000 --- a/frame/3/gemm/bli_gemm_int.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_gemm_int - ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - obj_t a_local; - obj_t b_local; - obj_t c_local; - gemm_var_oft f; - - // Check parameters. - if ( bli_error_checking_is_enabled() ) - bli_gemm_basic_check( alpha, a, b, beta, c, cntx ); - - // If C has a zero dimension, return early. 
- if ( bli_obj_has_zero_dim( c ) ) - { - return; - } - - // If A or B has a zero dimension, scale C by beta and return early. - if ( bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - if ( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_barrier( thread ); - return; - } - - // If A or B is marked as being filled with zeros, scale C by beta and - // return early. - if ( bli_obj_is_zeros( a ) || - bli_obj_is_zeros( b ) ) - { - // This should never execute. - bli_abort(); - - if ( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_barrier( thread ); - return; - } - - // Alias A, B, and C in case we need to update attached scalars. - bli_obj_alias_to( a, &a_local ); - bli_obj_alias_to( b, &b_local ); - bli_obj_alias_to( c, &c_local ); - - // If alpha is non-unit, typecast and apply it to the scalar attached - // to B. - if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( alpha, &b_local ); - } - - // If beta is non-unit, typecast and apply it to the scalar attached - // to C. - if ( !bli_obj_equals( beta, &BLIS_ONE ) ) - { - bli_obj_scalar_apply_scalar( beta, &c_local ); - } - - // Create the next node in the thrinfo_t structure. - bli_thrinfo_grow( rntm, cntl, thread ); - - // Extract the function pointer from the current control tree node. - f = bli_cntl_var_func( cntl ); - - // Invoke the variant. 
- f - ( - &a_local, - &b_local, - &c_local, - cntx, - rntm, - cntl, - thread - ); -} - diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 7bcc8a013..e7befc5b4 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -55,11 +55,8 @@ void PASTEMAC0(opname) \ GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) -GENPROT( gemm_packa ) -GENPROT( gemm_packb ) GENPROT( gemm_ker_var1 ) - GENPROT( gemm_ker_var2 ) diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 9f18a717d..2a9d91759 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -73,7 +73,14 @@ void bli_gemmt_front bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); - bli_obj_set_as_root( &c_local ); + + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel @@ -107,7 +114,7 @@ void bli_gemmt_front // Invoke the internal back-end via the thread handler. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMMT, // operation family id alpha, &a_local, diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 6d24ea496..3a1d681c3 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static gemm_var_oft vars[2] = +static l3_var_oft vars[2] = { bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2, }; @@ -51,8 +51,8 @@ void bli_gemmt_x_ker_var2 thrinfo_t* thread ) { - dim_t uplo; - gemm_var_oft f; + dim_t uplo; + l3_var_oft f; // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c ) ) uplo = 0; diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 7869f800a..9835de9c1 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -65,6 +65,14 @@ void bli_hemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + #ifdef BLIS_DISABLE_HEMM_RIGHT // NOTE: This case casts right-side hemm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel @@ -129,13 +137,6 @@ void bli_hemm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! 
- bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -152,7 +153,7 @@ void bli_hemm_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 52ef4cf36..be94c44c1 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -65,6 +65,14 @@ void bli_symm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + #ifdef BLIS_DISABLE_SYMM_RIGHT // NOTE: This case casts right-side symm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel @@ -128,13 +136,6 @@ void bli_symm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -151,7 +152,7 @@ void bli_symm_front // Invoke the internal back-end. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index fac7349f5..1de28958e 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -64,6 +64,14 @@ void bli_trmm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This @@ -147,13 +155,6 @@ void bli_trmm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -170,7 +171,7 @@ void bli_trmm_front // Invoke the internal back-end. 
bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index b9c176d97..898cfe242 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static gemm_var_oft vars[2][2] = +static l3_var_oft vars[2][2] = { { bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 }, { bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 } @@ -52,9 +52,9 @@ void bli_trmm_xx_ker_var2 thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - gemm_var_oft f; + dim_t side; + dim_t uplo; + l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 0ce961d1c..3b9753960 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -65,6 +65,14 @@ void bli_trmm3_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This @@ -139,13 +147,6 @@ void bli_trmm3_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! 
- bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -162,7 +163,7 @@ void bli_trmm3_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_gemm_int, + bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h index 00b604de6..964422d01 100644 --- a/frame/3/trsm/bli_trsm.h +++ b/frame/3/trsm/bli_trsm.h @@ -34,7 +34,5 @@ #include "bli_trsm_cntl.h" #include "bli_trsm_front.h" -#include "bli_trsm_int.h" - #include "bli_trsm_var.h" diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 578c37c32..30bf6921c 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -58,7 +58,7 @@ void bli_trsm_blk_var1 bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Isolate the diagonal block A11 and its corresponding row panel C1. - const dim_t kc = bli_obj_width( a ); + const dim_t kc = bli_obj_width_after_trans( a ); obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, 0, kc, a, &a11 ); @@ -96,7 +96,7 @@ void bli_trsm_blk_var1 #endif // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a11_1, @@ -169,7 +169,7 @@ void bli_trsm_blk_var1 // Perform gemm subproblem. (Note that we use the same backend // function as before, since we're calling the same macrokernel.) 
- bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a11, diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 23fd3ed4c..5691c964a 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -60,7 +60,7 @@ void bli_trsm_blk_var2 bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, - &my_start, &my_end + &my_start, &my_end ); // Partition along the n dimension. @@ -77,7 +77,7 @@ void bli_trsm_blk_var2 i, b_alg, c, &c1 ); // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, a, diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index a68cc853b..43fc25f16 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -71,7 +71,7 @@ void bli_trsm_blk_var3 i, b_alg, b, &b1 ); // Perform trsm subproblem. - bli_trsm_int + bli_l3_int ( &BLIS_ONE, &a1, diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 4a7a4de8f..a8196ebb9 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -57,16 +57,11 @@ cntl_t* bli_trsm_l_cntl_create ) { void_fp macro_kernel_p; - void_fp packa_fp; - void_fp packb_fp; // Use the function pointer to the macrokernels that use slab // assignment of micropanels to threads in the jr and ir loops. macro_kernel_p = bli_trsm_xx_ker_var2; - packa_fp = bli_packm_blk_var1; - packb_fp = bli_packm_blk_var1; - const opid_t family = BLIS_TRSM; // @@ -95,8 +90,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, // trsm operation's packm function for A. - packa_fp, + bli_l3_packa, // trsm operation's packm function for A. BLIS_MR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -133,8 +127,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, // trsm operation's packm function for A. - packa_fp, + bli_l3_packa, // trsm operation's packm function for A. 
BLIS_MR, BLIS_MR, #ifdef BLIS_ENABLE_TRSM_PREINVERSION @@ -171,10 +164,9 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_trsm_packb, - packb_fp, - BLIS_MR, + bli_l3_packb, BLIS_NR, + BLIS_MR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? @@ -208,7 +200,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, + rntm_t* rntm, pack_t schema_a, pack_t schema_b ) @@ -216,9 +208,6 @@ cntl_t* bli_trsm_r_cntl_create // NOTE: trsm macrokernels are presently disabled for right-side execution. void_fp macro_kernel_p = bli_trsm_xx_ker_var2; - void_fp packa_fp = bli_packm_blk_var1; - void_fp packb_fp = bli_packm_blk_var1; - const opid_t family = BLIS_TRSM; // Create two nodes for the macro-kernel. @@ -244,8 +233,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, - bli_trsm_packa, - packa_fp, + bli_l3_packa, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal @@ -270,8 +258,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, - bli_trsm_packb, - packb_fp, + bli_l3_packb, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 68a60b5bd..7f3d17aef 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -71,6 +71,14 @@ void bli_trsm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); + // Set the obj_t buffer field to the location currently implied by the row + // and column offsets and then zero the offsets. If any of the original + // obj_t's were views into larger matrices, this step effectively makes + // those obj_t's "forget" their lineage. + bli_obj_reset_origin( &a_local ); + bli_obj_reset_origin( &b_local ); + bli_obj_reset_origin( &c_local ); + // We do not explicitly implement the cases where A is transposed. 
// However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This @@ -121,13 +129,6 @@ void bli_trsm_front // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); - // Set each alias as the root object. - // NOTE: We MUST wait until we are done potentially swapping the objects - // before setting the root fields! - bli_obj_set_as_root( &a_local ); - bli_obj_set_as_root( &b_local ); - bli_obj_set_as_root( &c_local ); - // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. @@ -144,7 +145,7 @@ void bli_trsm_front // Invoke the internal back-end. bli_l3_thread_decorator ( - bli_trsm_int, + bli_l3_int, BLIS_TRSM, // operation family id alpha, &a_local, diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index de7c65936..8322a8b5b 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -55,8 +55,6 @@ void PASTEMAC0(opname) \ GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) -GENPROT( trsm_packa ) -GENPROT( trsm_packb ) GENPROT( trsm_xx_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index e30e6d751..c30a5828a 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -35,7 +35,7 @@ #include "blis.h" -static trsm_var_oft vars[2][2] = +static l3_var_oft vars[2][2] = { { bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 }, { bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 } @@ -52,9 +52,9 @@ void bli_trsm_xx_ker_var2 thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - trsm_var_oft f; + dim_t side; + dim_t uplo; + l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular diff --git 
a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 43e5101b5..23fbb4cd1 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -118,6 +118,11 @@ void bli_obj_create_without_buffer bli_obj_set_offs( 0, 0, obj ); bli_obj_set_diag_offset( 0, obj ); + bli_obj_set_pack_fn( NULL, obj ); + bli_obj_set_pack_params( NULL, obj ); + bli_obj_set_ker_fn( NULL, obj ); + bli_obj_set_ker_params( NULL, obj ); + // Set the internal scalar to 1.0. bli_obj_set_scalar_dt( dt, obj ); s = bli_obj_internal_scalar_buffer( obj ); @@ -356,7 +361,7 @@ void bli_obj_free buf_a = bli_obj_buffer_at_off( a ); - bli_zzsets( 0.0, 0.0, value ); + bli_zzsets( 0.0, 0.0, value ); if ( bli_obj_is_float( a ) ) { @@ -500,7 +505,7 @@ void bli_adjust_strides // Set the column stride to indicate that this is a column vector // stored in column-major order. This is done for legacy reasons, // because we at one time we had to satisify the error checking - // in the underlying BLAS library, which expects the leading + // in the underlying BLAS library, which expects the leading // dimension to be set to at least m, even if it will never be // used for indexing since it is a vector and thus only has one // column of data. 
diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index a924bbefc..f8835e5de 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -282,17 +282,6 @@ void bli_pba_acquire_v #endif -void bli_pba_rntm_set_pba - ( - rntm_t* rntm - ) -{ - pba_t* pba = bli_pba_query(); - - bli_rntm_set_pba( pba, rntm ); -} - - siz_t bli_pba_pool_size ( pba_t* pba, diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index ce19991f5..6431607ec 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -119,7 +119,7 @@ BLIS_INLINE void bli_pba_unlock( pba_t* pba ) // ----------------------------------------------------------------------------- -pba_t* bli_pba_query( void ); +BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( @@ -144,10 +144,15 @@ void bli_pba_release mem_t* mem ); -void bli_pba_rntm_set_pba +BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm - ); + ) +{ + pba_t* pba = bli_pba_query(); + + bli_rntm_set_pba( pba, rntm ); +} siz_t bli_pba_pool_size ( diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 1da6723c7..5b6ff6a0f 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -76,24 +76,39 @@ void* bli_sba_acquire // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - // Query the block_size of the pool_t so that we can request the exact - // size present. - const siz_t block_size = bli_pool_block_size( pool ); - - // Sanity check: Make sure the requested size is no larger than the - // block_size field of the pool. - if ( block_size < req_size ) + // We don't expect NULL sba_pool pointers in the normal course of BLIS + // operation. However, there are rare instances where it is convenient + // to support use of bli_sba_acquire() without having to pass in a valid + // sba pool data structure. The case that inspired this branch was the + // gemm_ukr and related test modules in the BLIS testsuite. 
(There, it + // is convenient to not have to checkout an array_t from the sba, and it + // does no harm since the malloc() happens outside of the region that + // would be timed.) + if ( pool == NULL ) { - printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", - ( int )block_size, ( int )req_size ); - bli_abort(); + block = bli_malloc_intl( req_size, &r_val ); + } + else + { + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. + if ( block_size < req_size ) + { + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); + } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. + block = bli_pblk_buf( &pblk ); } - - // Check out a block using the block_size queried above. - bli_pool_checkout_block( block_size, &pblk, pool ); - - // The block address is stored within the pblk_t. - block = bli_pblk_buf( &pblk ); } #else @@ -123,21 +138,28 @@ void bli_sba_release // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); - // Query the block_size field from the pool. This is not super-important - // for this particular application of the pool_t (that is, the "leaf" - // component of the sba), but it seems like good housekeeping to maintain - // the block_size field of the pblk_t in case its ever needed/read. - const siz_t block_size = bli_pool_block_size( pool ); - - // Embed the block's memory address into a pblk_t, along with the - // block_size queried from the pool. - bli_pblk_set_buf( block, &pblk ); - bli_pblk_set_block_size( block_size, &pblk ); - - // Check the pblk_t back into the pool_t. 
(It's okay that the pblk_t is - // a local variable since its contents are copied into the pool's internal - // data structure--an array of pblk_t.) - bli_pool_checkin_block( &pblk, pool ); + if ( pool == NULL ) + { + bli_free_intl( block ); + } + else + { + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case its ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. + bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) + bli_pool_checkin_block( &pblk, pool ); + } } #else diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 84c977289..fe174202c 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1189,52 +1189,48 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) // -- User-provided information macros -- -// User data query - -BLIS_INLINE void* bli_obj_user_data( obj_t* obj ) -{ - return obj->user_data; -} - -// User data modification +// Function pointer query -BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj ) +BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { - obj->user_data = data; + return obj->pack_fn; } -// Function pointer query - -BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) +BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { - return obj->pack; + return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { - return obj->ker; + 
return obj->ker_fn; } -BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj ) +BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { - return obj->ukr; + return obj->ker_params; } // Function pointer modification -BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj ) +BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { - obj->pack = pack; + obj->pack_fn = pack_fn; } -BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj ) +BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { - obj->ker = ker; + obj->pack_params = params; } -BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj ) +BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { - obj->ukr = ukr; + obj->ker_fn = ker_fn; +} + +BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) +{ + obj->ker_params = params; } @@ -1357,6 +1353,18 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) ); } +// Adjust the pointer based on current offsets, zero the offsets, and then +// set the current object as the root. For obj_t's with at least one non-zero +// offset, this effectively makes the obj_t "forget" that it was ever a view +// into a larger matrix. + +BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) +{ + bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); + bli_obj_set_offs( 0, 0, obj ); + bli_obj_set_as_root( obj ); +} + // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) @@ -1482,7 +1490,13 @@ BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { + bool a_root_is_self = ( bli_obj_root( a ) == a ); + bool b_root_is_self = ( bli_obj_root( b ) == b ); + obj_t t = *b; *b = *a; *a = t; + + if ( a_root_is_self ) bli_obj_set_as_root( b ); + if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index f1a7e8f8d..5be0ceeb4 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1174,12 +1174,11 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( - mdim_t mat, - mem_t* mem, struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, + struct cntl_s* cntl, struct thrinfo_s* thread ); @@ -1190,23 +1189,10 @@ typedef void (*obj_ker_fn_t) struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, + struct cntl_s* cntl, struct thrinfo_s* thread ); -typedef void (*obj_ukr_fn_t) - ( - dim_t m, - dim_t n, - dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - struct cntx_s* restrict cntx - ); - typedef struct obj_s { // Basic fields @@ -1237,13 +1223,11 @@ typedef struct obj_s dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel - // User data pointer - void* user_data; - - // Function pointers - obj_pack_fn_t pack; - obj_ker_fn_t ker; - obj_ukr_fn_t ukr; + // User-customizable fields + obj_pack_fn_t pack_fn; + void* pack_params; + obj_ker_fn_t ker_fn; + void* ker_params; } obj_t; @@ -1258,70 +1242,68 @@ typedef struct obj_s #define BLIS_OBJECT_INITIALIZER \ { \ - .root = NULL, \ + .root = NULL, \ \ - .off = { 0, 0 }, \ - .dim = { 0, 0 }, \ - .diag_off = 0, \ + .off = { 0, 0 }, \ + .dim = { 0, 0 }, \ + .diag_off = 0, \ \ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), /* this is changed later. */ \ + .info = 0x0 | BLIS_BITVAL_DENSE | \ + BLIS_BITVAL_GENERAL, \ + .info2 = 0x0, \ + .elem_size = sizeof( float ), /* this is changed later. 
*/ \ \ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ + .buffer = NULL, \ + .rs = 0, \ + .cs = 0, \ + .is = 1, \ \ - .scalar = { 0.0, 0.0 }, \ + .scalar = { 0.0, 0.0 }, \ \ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0, \ + .m_padded = 0, \ + .n_padded = 0, \ + .ps = 0, \ + .pd = 0, \ + .m_panel = 0, \ + .n_panel = 0, \ \ - .user_data = NULL, \ -\ - .pack = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack_fn = NULL, \ + .pack_params = NULL, \ + .ker_fn = NULL, \ + .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ - .root = NULL, \ -\ - .off = { 0, 0 }, \ - .dim = { 1, 1 }, \ - .diag_off = 0, \ + .root = NULL, \ \ - .info = 0x0 | BLIS_BITVAL_DENSE | \ - BLIS_BITVAL_GENERAL, \ - .info2 = 0x0, \ - .elem_size = sizeof( float ), /* this is changed later. */ \ + .off = { 0, 0 }, \ + .dim = { 1, 1 }, \ + .diag_off = 0, \ \ - .buffer = NULL, \ - .rs = 0, \ - .cs = 0, \ - .is = 1, \ + .info = 0x0 | BLIS_BITVAL_DENSE | \ + BLIS_BITVAL_GENERAL, \ + .info2 = 0x0, \ + .elem_size = sizeof( float ), /* this is changed later. 
*/ \ \ - .scalar = { 0.0, 0.0 }, \ + .buffer = NULL, \ + .rs = 0, \ + .cs = 0, \ + .is = 1, \ \ - .m_padded = 0, \ - .n_padded = 0, \ - .ps = 0, \ - .pd = 0, \ - .m_panel = 0, \ - .n_panel = 0, \ + .scalar = { 0.0, 0.0 }, \ \ - .user_data = NULL, \ + .m_padded = 0, \ + .n_padded = 0, \ + .ps = 0, \ + .pd = 0, \ + .m_panel = 0, \ + .n_panel = 0, \ \ - .pack = NULL, \ - .ker = NULL, \ - .ukr = NULL \ + .pack_fn = NULL, \ + .pack_params = NULL, \ + .ker_fn = NULL, \ + .ker_params = NULL \ } // Define these macros here since they must be updated if contents of @@ -1329,77 +1311,75 @@ typedef struct obj_s BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) { - b->root = a->root; - - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; - b->dim[0] = a->dim[0]; - b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; - - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; - - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; - - b->scalar = a->scalar; - - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; - - b->user_data = a->user_data; - - b->pack = a->pack; - b->ker = a->ker; - b->ukr = a->ukr; + b->root = a->root; + + b->off[0] = a->off[0]; + b->off[1] = a->off[1]; + b->dim[0] = a->dim[0]; + b->dim[1] = a->dim[1]; + b->diag_off = a->diag_off; + + b->info = a->info; + b->info2 = a->info2; + b->elem_size = a->elem_size; + + b->buffer = a->buffer; + b->rs = a->rs; + b->cs = a->cs; + b->is = a->is; + + b->scalar = a->scalar; + + //b->pack_mem = a->pack_mem; + b->m_padded = a->m_padded; + b->n_padded = a->n_padded; + b->ps = a->ps; + b->pd = a->pd; + b->m_panel = a->m_panel; + b->n_panel = a->n_panel; + + b->pack_fn = a->pack_fn; + b->pack_params = a->pack_params; + b->ker_fn = a->ker_fn; + b->ker_params = a->ker_params; } BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) { - 
b->root = a->root; + b->root = a->root; - b->off[0] = a->off[0]; - b->off[1] = a->off[1]; + b->off[0] = a->off[0]; + b->off[1] = a->off[1]; // Avoid copying m and n since they will be overwritten. - //b->dim[0] = a->dim[0]; - //b->dim[1] = a->dim[1]; - b->diag_off = a->diag_off; + //b->dim[0] = a->dim[0]; + //b->dim[1] = a->dim[1]; + b->diag_off = a->diag_off; - b->info = a->info; - b->info2 = a->info2; - b->elem_size = a->elem_size; + b->info = a->info; + b->info2 = a->info2; + b->elem_size = a->elem_size; - b->buffer = a->buffer; - b->rs = a->rs; - b->cs = a->cs; - b->is = a->is; + b->buffer = a->buffer; + b->rs = a->rs; + b->cs = a->cs; + b->is = a->is; - b->scalar = a->scalar; + b->scalar = a->scalar; // Avoid copying pack_mem entry. // FGVZ: You should probably make sure this is right. - //b->pack_mem = a->pack_mem; - b->m_padded = a->m_padded; - b->n_padded = a->n_padded; - b->ps = a->ps; - b->pd = a->pd; - b->m_panel = a->m_panel; - b->n_panel = a->n_panel; - - b->user_data = a->user_data; - - b->pack = a->pack; - b->ker = a->ker; - b->ukr = a->ukr; + //b->pack_mem = a->pack_mem; + b->m_padded = a->m_padded; + b->n_padded = a->n_padded; + b->ps = a->ps; + b->pd = a->pd; + b->m_panel = a->m_panel; + b->n_panel = a->n_panel; + + b->pack_fn = a->pack_fn; + b->pack_params = a->pack_params; + b->ker_fn = a->ker_fn; + b->ker_params = a->ker_params; } // Initializors for global scalar constants. 
diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 48996f28e..d37005b28 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -169,7 +169,6 @@ void libblis_test_gemm_ukr_experiment num_t datatype; dim_t m, n, k; - inc_t ldap, ldbp; char sc_a = 'c'; char sc_b = 'r'; @@ -194,11 +193,6 @@ void libblis_test_gemm_ukr_experiment m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); - // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, - // respectively. - ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx ); - ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx ); - // Store the register blocksizes so that the driver can retrieve the // values later when printing results. op->dim_aux[0] = m; @@ -237,7 +231,13 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + + // Transpose B to B^T for packing. + bli_obj_induce_trans( &b ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -248,56 +248,26 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx + cntx, + &rntm ); cntl_t* cntl_b = libblis_test_pobj_create ( - BLIS_KR, BLIS_NR, + BLIS_KR, BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx + cntx, + &rntm ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. 
Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, k, 1, ldap, &ap ); - bli_obj_create( datatype, k, ldbp, ldbp, 1, &bp ); - - // Set up the objects for packing. Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); - - // Pack the data from the source objects. - bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - - // Repeat the experiment n_repeats times and record results. + + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c_save, &c ); @@ -321,16 +291,10 @@ void libblis_test_gemm_ukr_experiment // Zero out performance and residual if output matrix is empty. 
libblis_test_check_empty_problem( &c, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. - bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + // back to the pba. + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index b3916db6a..48fcb78db 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -283,7 +283,10 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -294,59 +297,9 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - &cntx - ); - cntl_t* cntl_b = libblis_test_pobj_create - ( - BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx + cntx, + &rntm ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. 
Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, k+m, 1, ldap, &ap ); - bli_obj_create( datatype, k+m, ldbp, ldbp, 1, &bp ); - - // We overwrite the m dimension of ap and n dimension of bp with - // m and n, respectively, so that these objects contain the correct - // logical dimensions. Recall that ldap and ldbp were used only to - // induce bli_obj_create() to allocate sufficient memory for the - // duplication in rare instances where the subconfig uses a gemm - // ukernel that duplicates elements in one of the operands. - bli_obj_set_length( m, &ap ); - bli_obj_set_width( n, &bp ); - - // Set up the objects for packing. Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); // Set the diagonal offset of ap. if ( bli_is_lower( uploa ) ) { bli_obj_set_diag_offset( k, &ap ); } @@ -357,32 +310,45 @@ void libblis_test_gemmtrsm_ukr_experiment // to know how to initialize the subpartitions. bli_obj_set_uplo( uploa, &ap ); - // Pack the data from the source objects. 
- bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - - // Create subpartitions from the a and b panels. - bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, - &a1xp, &a11p, &bx1p, &b11p ); - - // Set the uplo field of a11p since the default for packed objects is - // BLIS_DENSE, and the _ukernel() wrapper needs this information to - // know which set of micro-kernels (lower or upper) to choose from. - bli_obj_set_uplo( uploa, &a11p ); - #if 0 bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - // Repeat the experiment n_repeats times and record results. + cntl_t* cntl_b = NULL; + + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); - // Re-pack (restore) the contents of b to bp. - //bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + // Transpose B to B^T for packing. + bli_obj_induce_trans( &b ); + + cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + + // Create subpartitions from the a and b panels. + bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, + &a1xp, &a11p, &bx1p, &b11p ); + + // Set the uplo field of a11p since the default for packed objects is + // BLIS_DENSE, and the _ukernel() wrapper needs this information to + // know which set of micro-kernels (lower or upper) to choose from. 
+ bli_obj_set_uplo( uploa, &a11p ); time = bli_clock(); @@ -391,6 +357,15 @@ bli_printm( "ap", &ap, "%5.2f", "" ); cntx ); time_min = bli_clock_min_diff( time_min, time ); + + // On the last pass, we must keep the packed B buffer checked out in order + // to perform the correctness check later. + if ( i < n_repeats - 1 ) + { + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + } } // Estimate the performance of the best experiment repeat. @@ -426,16 +401,11 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c11, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries - // back to the memory broker. - bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + // back to the pba. + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + if ( cntl_b ) + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a_big ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index bbfd0ac63..edab9796d 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -636,7 +636,7 @@ void libblis_test_read_op_info( test_ops_t* ops, int i, p; // Initialize the operation type field. - op->opid = opid; + op->opid = opid; // Read the line for the overall operation switch. libblis_test_read_next_line( buffer, input_stream ); @@ -671,7 +671,7 @@ void libblis_test_read_op_info( test_ops_t* ops, //printf( "buffer[p]: %s\n", &buffer[p] ); // Advance until we hit non-whitespace (ie: the next number). 
- for ( ; isspace( buffer[p] ); ++p ) ; + for ( ; isspace( buffer[p] ); ++p ) ; //printf( "buffer[p] after: %s\n", &buffer[p] ); @@ -680,7 +680,7 @@ void libblis_test_read_op_info( test_ops_t* ops, //printf( "dim[%d] = %d\n", i, op->dim_spec[i] ); // Advance until we hit whitespace (ie: the space before the next number). - for ( ; !isspace( buffer[p] ); ++p ) ; + for ( ; !isspace( buffer[p] ); ++p ) ; } } @@ -778,11 +778,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) // convert these values into strings, with "unset" being used if the // value returned was -1 (indicating the environment variable was unset). dim_t nt = bli_thread_get_num_threads(); - dim_t jc_nt = bli_thread_get_jc_nt(); - dim_t pc_nt = bli_thread_get_pc_nt(); - dim_t ic_nt = bli_thread_get_ic_nt(); - dim_t jr_nt = bli_thread_get_jr_nt(); - dim_t ir_nt = bli_thread_get_ir_nt(); + dim_t jc_nt = bli_thread_get_jc_nt(); + dim_t pc_nt = bli_thread_get_pc_nt(); + dim_t ic_nt = bli_thread_get_ic_nt(); + dim_t jr_nt = bli_thread_get_jr_nt(); + dim_t ir_nt = bli_thread_get_ir_nt(); if ( nt == -1 ) sprintf( nt_str, "unset" ); else sprintf( nt_str, "%d", ( int ) nt ); @@ -1739,7 +1739,7 @@ void libblis_test_op_driver = ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) ); for ( o = 0; o < n_operands; ++o ) - { + { unsigned int ij; operand_t operand_type = libblis_test_get_operand_type_for_char( o_types[o] ); @@ -2181,7 +2181,7 @@ void libblis_test_op_driver ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype ); // Loop over the requested parameter combinations. - for ( pci = 0; pci < n_param_combos; ++pci ) + for ( pci = 0; pci < n_param_combos; ++pci ) { // Loop over the requested problem sizes. 
for ( p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi ) @@ -2403,7 +2403,7 @@ void libblis_test_build_function_string if ( strlen( funcname_str ) > MAX_FUNC_STRING_LENGTH ) libblis_test_printf_error( "Function name string length (%d) exceeds maximum (%d).\n", strlen( funcname_str ), MAX_FUNC_STRING_LENGTH ); - + } @@ -2545,7 +2545,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c dim_t n_trans = n; dim_t rs = 1; // Initialization avoids a compiler warning. dim_t cs = 1; // Initialization avoids a compiler warning. - + // Apply the trans parameter to the dimensions (if needed). bli_set_dims_with_trans( trans, m, n, &m_trans, &n_trans ); @@ -2591,12 +2591,9 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c } - -#if 0 -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ) { bool does_inv_diag; - rntm_t rntm; if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; else does_inv_diag = TRUE; @@ -2606,7 +2603,6 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia ( NULL, // we don't need the small block allocator from the runtime. NULL, // func ptr is not referenced b/c we don't call via l3 _int(). - bli_packm_blk_var1, bmult_id_m, bmult_id_n, does_inv_diag, @@ -2617,20 +2613,13 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia NULL // no child node needed ); - // Initialize a local-to-BLIS rntm_t. This is simply so we have something - // to pass into bli_l3_packm(). The function doesn't (currently) use the - // runtime object, and even if it did, one with default values would work - // fine here. 
- bli_rntm_init( &rntm ); - // Pack the contents of A to P. - bli_l3_packm( a, p, cntx, &rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); // Return the control tree pointer so the caller can free the cntl_t and its // mem_t entry later on. return cntl; } -#endif void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ) @@ -2975,7 +2964,7 @@ void libblis_test_parse_message( FILE* output_stream, char* message, va_list arg char* the_string; char the_char; - // Begin looping over message to insert variables wherever there are + // Begin looping over message to insert variables wherever there are // format specifiers. for ( c = 0; message[c] != '\0'; ) { diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 786f82b30..cdb3c6dac 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -418,7 +418,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 6366e5fc3..b07da91cc 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -171,7 +171,6 @@ void libblis_test_trsm_ukr_experiment num_t datatype; dim_t m, n; - inc_t ldap, ldbp; char sc_a = 'c'; char sc_b = 'r'; @@ 
-196,11 +195,6 @@ void libblis_test_trsm_ukr_experiment m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); - // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, - // respectively. - ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx ); - ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx ); - // Store the register blocksizes so that the driver can retrieve the // values later when printing results. op->dim_aux[0] = m; @@ -238,7 +232,10 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); -#if 0 + rntm_t rntm; + bli_rntm_init( &rntm ); + bli_pba_rntm_set_pba( &rntm ); + // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -249,50 +246,9 @@ void libblis_test_trsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx + cntx, + &rntm ); - cntl_t* cntl_b = libblis_test_pobj_create - ( - BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - cntx - ); -#endif - - // Create the packed objects. Use packmr and packnr as the leading - // dimensions of ap and bp, respectively. Note that we use the ldims - // instead of the matrix dimensions for allocation purposes here. - // This is a little hacky and was prompted when trying to support - // configurations such as power9 that employ duplication/broadcasting - // of elements in one of the packed matrix objects. Thankfully, packm - // doesn't care about those dimensions and instead relies on - // information taken from the source object. Thus, this is merely - // about coaxing bli_obj_create() in allocating enough space for our - // purposes. - bli_obj_create( datatype, ldap, m, 1, ldap, &ap ); - bli_obj_create( datatype, m, ldbp, ldbp, 1, &bp ); - - // Set up the objects for packing. 
Calling packm_init_pack() does everything - // except checkout a memory pool block and save its address to the obj_t's. - // However, it does overwrite the buffer field of packed object with that of - // the source object (as a side-effect of bli_obj_alias_to(); that buffer - // field would normally be overwritten yet again by the address from the - // memory pool block). So, we have to save the buffer address that was - // allocated so we can re-store it to the object afterward. - void* buf_ap = bli_obj_buffer( &ap ); - void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_MR, BLIS_KR, &a, &ap, cntx ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, - BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, - BLIS_KR, BLIS_NR, &b, &bp, cntx ); - bli_obj_set_buffer( buf_ap, &ap ); - bli_obj_set_buffer( buf_bp, &bp ); // Set the diagonal offset of ap. bli_obj_set_diag_offset( 0, &ap ); @@ -302,24 +258,35 @@ void libblis_test_trsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, &ap ); - // Pack the data from the source objects. - bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - #if 0 bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { - // Re-pack the contents of b to bp. - //bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); - bli_copym( &c_save, &c ); + // Transpose B to B^T for packing. 
+ bli_obj_induce_trans( &b ); + + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_NR, + BLIS_MR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + cntx, + &rntm + ); + + // Transpose B^T back to B and Bp^T back to Bp. + bli_obj_induce_trans( &b ); + bli_obj_induce_trans( &bp ); + time = bli_clock(); libblis_test_trsm_ukr_impl( iface, side, @@ -327,6 +294,10 @@ bli_printm( "ap", &ap, "%5.2f", "" ); cntx ); time_min = bli_clock_min_diff( time_min, time ); + + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); } // Estimate the performance of the best experiment repeat. @@ -339,16 +310,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c, perf, resid ); -#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( NULL, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( NULL, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); -#endif - - // Free the packed objects. - bli_obj_free( &ap ); - bli_obj_free( &bp ); + bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); From 961d9d509dd94f3a66f7095057e3dc8eb6d89839 Mon Sep 17 00:00:00 2001 From: Kiran Date: Wed, 8 Dec 2021 03:00:38 +0530 Subject: [PATCH 013/230] Re-add BLIS_ENABLE_ZEN_BLOCK_SIZES macro for 'zen'. Details: - Added previously-deleted cpp macro block to bli_cntx_init_zen.c targeting the Naples microarchitecture that enabled different cache blocksizes when the number of threads exceeds 16. This commit represents PR #573. 
--- config/zen/bli_cntx_init_zen.c | 15 +++++++++++---- config/zen/bli_family_zen.h | 1 + 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index ed7287cee..615a31a04 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -173,15 +173,22 @@ void bli_cntx_init_zen( cntx_t* cntx ) mc = 510, kc = 1024 and nc = 4080 */ +#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES + // Zen optmized level 3 cache block sizes #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); #else bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 ); #endif +#else + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); +#endif bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); diff --git a/config/zen/bli_family_zen.h b/config/zen/bli_family_zen.h index d1c4ef828..da03bd7e4 100644 --- a/config/zen/bli_family_zen.h +++ b/config/zen/bli_family_zen.h @@ -39,6 +39,7 @@ #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 +#define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. 
#if 0 From 54fa28bd847b389215cffb57a83dc9b3dce79c86 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 24 Dec 2021 08:00:33 -0600 Subject: [PATCH 014/230] Move edge cases to gemm ukr; more user-custom mods. (#583) Details: - Moved edge-case handling into the gemm microkernel. This required changing the microkernel API to take m and n dimension parameters. This required updating all existing gemm microkernel function pointer types, function signatures, and related definitions to take m and n dimensions. We also updated all existing kernels in the 'kernels' directory to take m and n dimensions, and implemented edge-case handling within those microkernels via a collection of new C preprocessor macros defined within bli_edge_case_macro_defs.h. Also removed the assembly code that formerly would handle general stride IO on the microtile, since this can now be handled by the same code that does edge cases. - Pass the obj_t.ker_fn (of matrix C) into bli_gemm_cntl_create() and bli_trsm_cntl_create(), where this function pointer is used in lieu of the default macrokernel when it is non-NULL, and ignored when it is NULL. - Re-implemented macrokernel in bli_gemm_ker_var2.c to be a single function using byte pointers rather than one function for each floating-point datatype. Also, obtain the microkernel function pointer from the .ukr field of the params struct embedded within the obj_t for matrix C (assuming params is non-NULL and contains a non-NULL value in the .ukr field). Communicate both the gemm microkernel pointer to use as well as the params struct to the microkernel via the auxinfo_t struct. - Defined gemm_ker_params_t type (for the aforementioned obj_t.params struct) in bli_gemm_var.h. - Retired the separate _md macrokernel for mixed datatype computation. We now use the reimplemented bli_gemm_ker_var2() instead. - Updated gemmt macrokernels to pass m and n dimensions into microkernel calls. - Removed edge-case handling from trmm and trsm macrokernels.
- Moved most of bli_packm_alloc() code into a new helper function, bli_packm_alloc_ex(). - Fixed a typo bug in bli_gemmtrsm_u_template_noopt_mxn.c. - Added test/syrk_diagonal and test/tensor_contraction directories with associated code to test those operations. --- .../kernels/3/bli_gemm_template_noopt_mxn.c | 13 +- .../3/bli_gemmtrsm_l_template_noopt_mxn.c | 4 + .../3/bli_gemmtrsm_u_template_noopt_mxn.c | 8 +- frame/1m/packm/bli_packm_alloc.c | 58 +- frame/1m/packm/bli_packm_alloc.h | 23 +- frame/3/bli_l3_cntl.c | 18 +- frame/3/bli_l3_ft_ukr.h | 2 + frame/3/bli_l3_ukr_oapi.c | 4 + frame/3/bli_l3_ukr_prot.h | 2 + frame/3/bli_l3_ukr_tapi.c | 63 +- frame/3/gemm/bli_gemm_cntl.c | 15 +- frame/3/gemm/bli_gemm_cntl.h | 6 +- frame/3/gemm/bli_gemm_front.c | 87 - frame/3/gemm/bli_gemm_ker_var2.c | 570 ++- frame/3/gemm/bli_gemm_ker_var2_md.c | 406 -- frame/3/gemm/bli_gemm_md.h | 61 +- frame/3/gemm/bli_gemm_md_c2r_ref.c | 49 +- frame/3/gemm/bli_gemm_var.h | 39 +- frame/3/gemm/ind/bli_gemm_ind_opt.h | 2 + frame/3/gemmt/bli_gemmt_l_ker_var2.c | 107 +- frame/3/gemmt/bli_gemmt_u_ker_var2.c | 107 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 125 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 121 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 127 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 129 +- frame/3/trsm/bli_trsm_cntl.c | 21 +- frame/3/trsm/bli_trsm_cntl.h | 9 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 52 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 52 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 52 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 52 +- frame/base/bli_auxinfo.h | 20 +- frame/include/bli_edge_case_macro_defs.h | 109 + frame/include/bli_macro_defs.h | 1 + frame/include/bli_type_defs.h | 7 + .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 15 +- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 15 +- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 15 +- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 15 +- .../3/bli_gemm_armsve_asm_z2vx7_unindexed.c | 15 +- .../3/bli_gemm_armsve_asm_z2vx8_unindexed.c | 15 
+- kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c | 48 +- kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c | 115 +- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 3948 +++++++++-------- kernels/bgq/3/bli_gemm_bgq_int_8x8.c | 12 + .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c | 2073 +++------ kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 1940 +++----- kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 1898 +++----- kernels/knc/3/bli_dgemm_knc_asm_30x8.c | 127 +- kernels/knc/3/bli_sgemm_knc_asm_30x16.c | 129 +- kernels/knl/3/bli_dgemm_knl_asm_24x8.c | 85 +- kernels/knl/3/bli_sgemm_knl_asm_24x16.c | 85 +- kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c | 1300 ++---- .../3/bli_gemm_piledriver_asm_d8x3.c | 2029 +++------ kernels/power10/3/bli_dgemm_power10_mma.c | 43 +- kernels/power10/3/bli_i16gemm_power10_mma.c | 10 +- kernels/power10/3/bli_i16sgemm_power10_mma.c | 10 +- kernels/power10/3/bli_i4gemm_power10_mma.c | 40 +- kernels/power10/3/bli_i8gemm_power10_mma.c | 38 +- kernels/power10/3/bli_sbgemm_power10_mma.c | 18 +- kernels/power10/3/bli_sgemm_power10_mma.c | 24 +- kernels/power10/3/bli_shgemm_power10_mma.c | 18 +- kernels/power7/3/bli_gemm_power7_int_8x4.c | 368 +- .../power7/3/test/bli_gemm_power7_int_8x4.h | 8 + kernels/power9/3/bli_gemm_power9_asm_d12x6.c | 238 +- .../3/bli_gemm_sandybridge_asm_d8x4.c | 3030 ++++--------- .../3/bli_gemm_sandybridge_int_d8x4.c | 361 +- kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 105 +- kernels/skx/3/bli_dgemm_skx_asm_16x14.c | 183 +- kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 111 +- ref_kernels/3/bb/bli_gemmbb_ref.c | 5 +- ref_kernels/3/bb/bli_gemmtrsmbb_ref.c | 2 + ref_kernels/3/bli_gemm_ref.c | 23 +- ref_kernels/3/bli_gemmtrsm_ref.c | 4 + ref_kernels/ind/bli_gemm1m_ref.c | 26 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 2 + test/syrk_diagonal/complex_math.hpp | 267 ++ test/syrk_diagonal/syrk_diagonal_example.c | 186 + test/syrk_diagonal/syrk_diagonal_example.cxx | 220 + test/syrk_diagonal/syrk_diagonal_example2.c | 354 ++ 
test/syrk_diagonal/syrk_diagonal_example2.cxx | 338 ++ test/syrk_diagonal/syrk_diagonal_ref.cxx | 102 + test/syrk_diagonal/syrk_diagonal_ref.h | 8 + test/tensor_contraction/complex_math.hpp | 267 ++ test/tensor_contraction/tcontract_example.cxx | 988 +++++ test/tensor_contraction/tcontract_ref.cxx | 67 + test/tensor_contraction/tcontract_ref.hpp | 100 + 87 files changed, 10458 insertions(+), 13506 deletions(-) delete mode 100644 frame/3/gemm/bli_gemm_ker_var2_md.c create mode 100644 frame/include/bli_edge_case_macro_defs.h create mode 100644 test/syrk_diagonal/complex_math.hpp create mode 100644 test/syrk_diagonal/syrk_diagonal_example.c create mode 100644 test/syrk_diagonal/syrk_diagonal_example.cxx create mode 100644 test/syrk_diagonal/syrk_diagonal_example2.c create mode 100644 test/syrk_diagonal/syrk_diagonal_example2.cxx create mode 100644 test/syrk_diagonal/syrk_diagonal_ref.cxx create mode 100644 test/syrk_diagonal/syrk_diagonal_ref.h create mode 100644 test/tensor_contraction/complex_math.hpp create mode 100644 test/tensor_contraction/tcontract_example.cxx create mode 100644 test/tensor_contraction/tcontract_ref.cxx create mode 100644 test/tensor_contraction/tcontract_ref.hpp diff --git a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c index b7a13f3b6..06f25a0e9 100644 --- a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c @@ -37,6 +37,8 @@ void bli_zgemm_template_noopt ( + dim_t m, + dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a1, @@ -88,8 +90,7 @@ void bli_zgemm_template_noopt dim_t l, j, i; - dcomplex ab[ bli_zmr * - bli_znr ]; + dcomplex ab[ mr * nr ]; dcomplex* abij; dcomplex ai, bj; @@ -137,16 +138,16 @@ void bli_zgemm_template_noopt if ( bli_zeq0( *beta ) ) { /* c11 := ab */ - bli_zcopys_mxn( mr, - nr, + bli_zcopys_mxn( m, + n, ab, rs_ab, cs_ab, c11, rs_c, cs_c ); } else { /* c11 := beta * c11 + ab 
*/ - bli_zxpbys_mxn( mr, - nr, + bli_zxpbys_mxn( m, + n, ab, rs_ab, cs_ab, beta, c11, rs_c, cs_c ); diff --git a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c index da0cd3110..87c21f7ed 100644 --- a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c @@ -74,6 +74,8 @@ void bli_zgemmtrsm_l_template_noopt */ const num_t dt = BLIS_DCOMPLEX; + const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t rs_b = packnr; @@ -84,6 +86,8 @@ void bli_zgemmtrsm_l_template_noopt /* b11 = alpha * b11 - a10 * b01; */ bli_zgemm_template_noopt ( + mr, + nr, k, minus_one, a10, diff --git a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c index 09b3af9ce..0b4544ae1 100644 --- a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c @@ -74,6 +74,8 @@ void bli_zgemmtrsm_u_template_noopt */ const num_t dt = BLIS_DCOMPLEX; + const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t rs_b = packnr; @@ -84,10 +86,12 @@ void bli_zgemmtrsm_u_template_noopt /* b11 = alpha * b11 - a12 * b21; */ bli_zgemm_template_noopt ( + mr, + nr, k, minus_one, - a12, - b21, + a10, + b01, alpha, b11, rs_b, cs_b, data diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index df6750d7a..b12a93ddc 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -36,16 +36,35 @@ #include "blis.h" void* bli_packm_alloc - ( - siz_t size_needed, - rntm_t* 
rntm, - cntl_t* cntl, - thrinfo_t* thread - ) + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) { // Query the pack buffer type from the control tree node. packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + return bli_packm_alloc_ex + ( + size_needed, + pack_buf_type, + rntm, + cntl, + thread + ); +} + +void* bli_packm_alloc_ex + ( + siz_t size_needed, + packbuf_t pack_buf_type, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ // Query the address of the mem_t entry within the control tree node. mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); @@ -55,7 +74,7 @@ void* bli_packm_alloc siz_t cntl_mem_size = 0; if ( bli_mem_is_alloc( cntl_mem_p ) ) - cntl_mem_size = bli_mem_size( cntl_mem_p ); + cntl_mem_size = bli_mem_size( cntl_mem_p ); if ( cntl_mem_size < size_needed ) { @@ -64,14 +83,15 @@ void* bli_packm_alloc // The chief thread releases the existing block associated with // the mem_t entry in the control tree, and then re-acquires a // new block, saving the associated mem_t entry to local_mem_s. - if ( bli_mem_is_alloc( cntl_mem_p ) ) - { - bli_pba_release - ( - rntm, - cntl_mem_p - ); - } + if ( bli_mem_is_alloc( cntl_mem_p ) ) + { + bli_pba_release + ( + rntm, + cntl_mem_p + ); + } + bli_pba_acquire_m ( rntm, @@ -89,11 +109,11 @@ void* bli_packm_alloc // this thread's control tree node. *cntl_mem_p = *local_mem_p; - // Barrier so that the master thread doesn't return from the function - // before we are done reading. - bli_thread_barrier( thread ); + // Barrier so that the master thread doesn't return from the function + // before we are done reading. 
+ bli_thread_barrier( thread ); } - return bli_mem_buffer( cntl_mem_p ); + return bli_mem_buffer( cntl_mem_p ); } diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h index b433be350..5a5cf126b 100644 --- a/frame/1m/packm/bli_packm_alloc.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -32,11 +32,20 @@ */ -BLIS_EXPORT_BLIS void* bli_packm_alloc - ( - siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); +BLIS_EXPORT_BLIS void* bli_packm_alloc + ( + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); + +BLIS_EXPORT_BLIS void* bli_packm_alloc_ex + ( + siz_t size_needed, + packbuf_t pack_buf_type, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 3cdecfbc2..83ff8e5af 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -57,7 +57,14 @@ void bli_l3_cntl_create_if family == BLIS_GEMMT || family == BLIS_TRMM ) { - *cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b ); + *cntl_use = bli_gemm_cntl_create + ( + rntm, + family, + schema_a, + schema_b, + bli_obj_ker_fn( c ) + ); } else // if ( family == BLIS_TRSM ) { @@ -66,7 +73,14 @@ void bli_l3_cntl_create_if if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; else side = BLIS_RIGHT; - *cntl_use = bli_trsm_cntl_create( rntm, side, schema_a, schema_b ); + *cntl_use = bli_trsm_cntl_create + ( + rntm, + side, + schema_a, + schema_b, + bli_obj_ker_fn( c ) + ); } } else diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h index 4249dcbd6..561c8264f 100644 --- a/frame/3/bli_l3_ft_ukr.h +++ b/frame/3/bli_l3_ft_ukr.h @@ -47,6 +47,8 @@ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ diff --git a/frame/3/bli_l3_ukr_oapi.c b/frame/3/bli_l3_ukr_oapi.c index 33262b0bb..b8f2e00e6 100644 --- a/frame/3/bli_l3_ukr_oapi.c +++ b/frame/3/bli_l3_ukr_oapi.c @@ -51,6 +51,8 @@ void 
PASTEMAC0(opname) \ \ num_t dt = bli_obj_dt( c ); \ \ + dim_t m = bli_obj_length( c ); \ + dim_t n = bli_obj_width( c ); \ dim_t k = bli_obj_width( a ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ void* buf_b = bli_obj_buffer_at_off( b ); \ @@ -75,6 +77,8 @@ void PASTEMAC0(opname) \ \ f \ ( \ + m, \ + n, \ k, \ buf_alpha, \ buf_a, \ diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h index ca523b1d7..f68973ff5 100644 --- a/frame/3/bli_l3_ukr_prot.h +++ b/frame/3/bli_l3_ukr_prot.h @@ -42,6 +42,8 @@ \ void PASTEMAC(ch,opname) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype_out* restrict alpha, \ ctype_in* restrict a, \ diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c index 67e33175b..ab745d12b 100644 --- a/frame/3/bli_l3_ukr_tapi.c +++ b/frame/3/bli_l3_ukr_tapi.c @@ -39,6 +39,8 @@ \ void PASTEMAC(ch,opname) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ @@ -58,16 +60,19 @@ void PASTEMAC(ch,opname) \ PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. */ \ - f( \ - k, \ - alpha, \ - a, \ - b, \ - beta, \ - c, rs_c, cs_c, \ - data, \ - cntx \ - ); \ + f \ + ( \ + m, \ + n, \ + k, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + data, \ + cntx \ + ); \ } \ INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm, BLIS_GEMM_UKR ) @@ -98,17 +103,18 @@ void PASTEMAC(ch,opname) \ PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. 
*/ \ - f( \ - k, \ - alpha, \ - a1x, \ - a11, \ - bx1, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ + f \ + ( \ + k, \ + alpha, \ + a1x, \ + a11, \ + bx1, \ + b11, \ + c11, rs_c, cs_c, \ + data, \ + cntx \ + ); \ } \ INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ukernel, gemmtrsm, BLIS_GEMMTRSM_L_UKR ) @@ -136,13 +142,14 @@ void PASTEMAC(ch,opname) \ PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. */ \ - f( \ - a, \ - b, \ - c, rs_c, cs_c, \ - data, \ - cntx \ - ); \ + f \ + ( \ + a, \ + b, \ + c, rs_c, cs_c, \ + data, \ + cntx \ + ); \ } \ INSERT_GENTFUNC_BASIC2( trsm_l_ukernel, trsm, BLIS_TRSM_L_UKR ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 72d78efe1..052c812a3 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -40,10 +40,11 @@ cntl_t* bli_gemm_cntl_create rntm_t* rntm, opid_t family, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ) { - return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b ); + return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b, ker ); } // ----------------------------------------------------------------------------- @@ -53,18 +54,22 @@ cntl_t* bli_gemmbp_cntl_create rntm_t* rntm, opid_t family, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ) { void_fp macro_kernel_fp; - // Use the function pointers to the macrokernels that use slab - // assignment of micropanels to threads in the jr and ir loops. + // Choose the default macrokernel based on the operation family... if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; + // ...unless a non-NULL kernel function pointer is passed in, in which + // case we use that instead. 
+ if ( ker ) macro_kernel_fp = ker; + // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index bff91b58a..5fa213ac4 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -38,7 +38,8 @@ cntl_t* bli_gemm_cntl_create rntm_t* rntm, opid_t family, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ); // ----------------------------------------------------------------------------- @@ -48,7 +49,8 @@ cntl_t* bli_gemmbp_cntl_create rntm_t* rntm, opid_t family, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ); #if 0 diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index a9ea21dc4..4ff45036f 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -283,90 +283,3 @@ void bli_gemm_front #endif } -// ----------------------------------------------------------------------------- - -#if 0 - if ( bli_obj_dt( a ) != bli_obj_dt( b ) || - bli_obj_dt( a ) != bli_obj_dt( c ) || - bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) - { - const bool a_is_real = bli_obj_is_real( a ); - const bool a_is_comp = bli_obj_is_complex( a ); - const bool b_is_real = bli_obj_is_real( b ); - const bool b_is_comp = bli_obj_is_complex( b ); - const bool c_is_real = bli_obj_is_real( c ); - const bool c_is_comp = bli_obj_is_complex( c ); - - const bool a_is_single = bli_obj_is_single_prec( a ); - const bool a_is_double = bli_obj_is_double_prec( a ); - const bool b_is_single = bli_obj_is_single_prec( b ); - const bool b_is_double = bli_obj_is_double_prec( b ); - const bool c_is_single = bli_obj_is_single_prec( c ); - const bool c_is_double = bli_obj_is_double_prec( c ); - - const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC; - const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC; - - const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) 
|| - bli_obj_domain( c ) != bli_obj_domain( b ); - - ( void )a_is_real; ( void )a_is_comp; - ( void )b_is_real; ( void )b_is_comp; - ( void )c_is_real; ( void )c_is_comp; - ( void )a_is_single; ( void )a_is_double; - ( void )b_is_single; ( void )b_is_double; - ( void )c_is_single; ( void )c_is_double; - ( void )comp_single; ( void )comp_double; - - if ( - //( c_is_comp && a_is_comp && b_is_real ) || - //( c_is_comp && a_is_real && b_is_comp ) || - //( c_is_real && a_is_comp && b_is_comp ) || - //( c_is_comp && a_is_real && b_is_real ) || - //( c_is_real && a_is_comp && b_is_real ) || - //( c_is_real && a_is_real && b_is_comp ) || - //FALSE - TRUE - ) - { - if ( - ( c_is_single && a_is_single && b_is_single && mixeddomain ) || - ( c_is_single && a_is_single && b_is_single && comp_single ) || - ( c_is_single && a_is_single && b_is_single && comp_double ) || - ( c_is_single && a_is_single && b_is_double ) || - ( c_is_single && a_is_double && b_is_single ) || - ( c_is_double && a_is_single && b_is_single ) || - ( c_is_single && a_is_double && b_is_double ) || - ( c_is_double && a_is_single && b_is_double ) || - ( c_is_double && a_is_double && b_is_single ) || - ( c_is_double && a_is_double && b_is_double && comp_single ) || - ( c_is_double && a_is_double && b_is_double && comp_double ) || - ( c_is_double && a_is_double && b_is_double && mixeddomain ) || - FALSE - ) - bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl ); - else - bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl ); - } - else - bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl ); - return; - } -#else -#if 0 - // If any of the storage datatypes differ, or if the execution precision - // differs from the storage precision of C, utilize the mixed datatype - // code path. - // NOTE: We could check the exec dt against the storage dt of C, but for - // now we don't support the caller setting the execution domain - // explicitly. 
- if ( bli_obj_dt( a ) != bli_obj_dt( b ) || - bli_obj_dt( a ) != bli_obj_dt( c ) || - bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) - { - bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl ); - return; - } -#endif -#endif - diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 0c9060552..6de361194 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -35,28 +35,44 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp +typedef void (*xpbys_mxn_vft) + ( + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); -typedef void (*FUNCPTR_T) - ( - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); +#undef GENTFUNC2 +#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \ +\ +void PASTEMAC2(chx,chy,op) \ + ( \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctypex* restrict x_cast = x; \ + ctypey* restrict b_cast = b; \ + ctypey* restrict y_cast = y; \ +\ + PASTEMAC3(chx,chy,chy,xpbys_mxn) \ + ( \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} -static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); +INSERT_GENTFUNC2_BASIC0(xbpys_mxn_fn); +INSERT_GENTFUNC2_MIXDP0(xbpys_mxn_fn); + +static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn); void bli_gemm_ker_var2 @@ -70,23 +86,8 @@ void bli_gemm_ker_var2 thrinfo_t* thread ) { -#ifdef BLIS_ENABLE_GEMM_MD - // By now, A and B have been packed and cast to the execution precision. - // In most cases, such as when storage precision of C differs from the - // execution precision, we utilize the mixed datatype code path. 
However, - // a few cases still fall within this kernel, such as mixed domain with - // equal precision (ccr, crc, rcc), hence those expressions being disabled - // in the conditional below. - if ( //( bli_obj_domain( c ) != bli_obj_domain( a ) ) || - //( bli_obj_domain( c ) != bli_obj_domain( b ) ) || - ( bli_obj_dt( c ) != bli_obj_exec_dt( c ) ) ) - { - bli_gemm_ker_var2_md( a, b, c, cntx, rntm, cntl, thread ); - return; - } -#endif - num_t dt_exec = bli_obj_exec_dt( c ); + num_t dt_c = bli_obj_dt( c ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); @@ -95,50 +96,55 @@ void bli_gemm_ker_var2 dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); + char* a_cast = bli_obj_buffer_at_off( a ); inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); + char* b_cast = bli_obj_buffer_at_off( b ); inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); + char* c_cast = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; // Detach and multiply the scalars attached to A and B. + // NOTE: We know that the internal scalars of A and B are already of the + // target datatypes because the necessary typecasting would have already + // taken place during bli_packm_init(). 
+ obj_t scalar_a; + obj_t scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + // NOTE: We know that scalar_b is of type dt_exec due to the above code + // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, + // and we know that the internal scalar in C is already of the type dt_c + // due to the casting in the implementation of bli_obj_scalar_attach(). + char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); + char* beta_cast = bli_obj_internal_scalar_buffer( c ); // If 1m is being employed on a column- or row-stored matrix with a // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. + // Only employ this optimization if the storage datatype of C is + // equal to the execution/computation datatype. #if 1 if ( bli_cntx_method( cntx ) == BLIS_1M ) { bli_gemm_ind_recast_1m_params ( &dt_exec, + &dt_c, schema_a, c, &m, &n, &k, @@ -151,273 +157,211 @@ void bli_gemm_ker_var2 #ifdef BLIS_ENABLE_GEMM_MD // Tweak parameters in select mixed domain cases (rcc, crc, ccr). - bli_gemm_md_ker_var2_recast - ( - &dt_exec, - bli_obj_dt( a ), - bli_obj_dt( b ), - bli_obj_dt( c ), - &m, &n, &k, - &pd_a, &ps_a, - &pd_b, &ps_b, - c, - &rs_c, &cs_c - ); + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_gemm_md_ker_var2_recast + ( + &dt_exec, + bli_obj_dt( a ), + bli_obj_dt( b ), + &dt_c, + &m, &n, &k, + &pd_a, &ps_a, + &pd_b, &ps_b, + c, + &rs_c, &cs_c + ); + } #endif - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} + siz_t dt_size = bli_dt_size( dt_exec ); + siz_t dt_c_size = bli_dt_size( dt_c ); + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + //const dim_t PACKMR = cs_a; + //const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + + // Query the params field from the obj_t. If it is non-NULL, grab the ukr + // field of the params struct. If that function pointer is non-NULL, use it + // as our microkernel instead of the default microkernel queried from the + // cntx above. + gemm_ker_params_t* params = bli_obj_ker_params( c ); + gemm_ukr_vft user_ukr = params ? params->ukr : NULL; + if ( user_ukr ) gemm_ukr = user_ukr; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? MR : 1 ); + char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + + // + // Assumptions/assertions: + // rs_a == 1 + // cs_a == PACKMR + // pd_a == MR + // ps_a == stride to next micro-panel of A + // rs_b == PACKNR + // cs_b == 1 + // pd_b == NR + // ps_b == stride to next micro-panel of B + // rs_c == (no assumptions) + // cs_c == (no assumptions) + // + + // Compute number of primary and leftover components of the m and n + // dimensions. 
+ dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_c_size; + inc_t cstep_c = cs_c * NR * dt_c_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // Save the virtual microkernel address and the params. + bli_auxinfo_set_ukr( gemm_ukr, &aux ); + bli_auxinfo_set_params( params, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + dim_t ir_nt = bli_thread_n_way( caucus ); + dim_t ir_tid = bli_thread_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + + // Determine the thread range and increment for the 2nd and 1st loops. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). 
+ for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + char* b2 = b1; + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Edge case handling now occurs within the microkernel itself, but + // we must still explicitly accumulate to a temporary microtile in + // situations where a virtual microkernel is being used, such as + // during the 1m method or some cases of mixed datatypes. + if ( dt_exec == dt_c ) + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + alpha_cast, + a1, + b1, + beta_cast, + c11, rs_c, cs_c, + &aux, + cntx + ); + } + else + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + alpha_cast, + a1, + b1, + zero, + &ct, rs_ct, cs_ct, + &aux, + cntx + ); + + // Accumulate to C with type-casting. 
+ xbpys_mxn[ dt_exec ][ dt_c ] + ( + m_cur, n_cur, + &ct, rs_ct, cs_ct, + beta_cast, + c11, rs_c, cs_c + ); + } + } + } -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. Note that the virtual gemm ukernel is queried - instead of the native gemm ukernel. This is needed for certain - situations for the 1m method that require an extra layer of logic - to allow for handling (for example) complex values of beta. Also - note that under certain circumstances, the real-domain version of - this macrokernel will be called for 1m (NOT the complex version) - as an optimization. In these cases, the corresponding real-domain - slots within the cntx_t's virtual gemm ukernel func_t will contain - pointers to the *native* gemm ukernel, thanks to logic in the - context initialization function for the induced method (defined - in bli_cntx_ref.c). */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. 
*/ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t i, j; \ - dim_t m_cur; \ - dim_t n_cur; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. 
*/ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Determine the thread range and increment for the 2nd and 1st loops. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the bottom edge of C and add the result from above. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - } \ - } \ -\ /* -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( gemm_ker_var2 ) - diff --git a/frame/3/gemm/bli_gemm_ker_var2_md.c b/frame/3/gemm/bli_gemm_ker_var2_md.c deleted file mode 100644 index 09c279d14..000000000 --- a/frame/3/gemm/bli_gemm_ker_var2_md.c +++ /dev/null @@ -1,406 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_GEMM_MD - -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY2_ALL(ftypes,gemm_ker_var2_md); - - -void bli_gemm_ker_var2_md - ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - num_t dt_c = bli_obj_dt( c ); - - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; - - // Detach and multiply the scalars attached to A and B. - // NOTE: We know that the internal scalars of A and B are already of the - // target datatypes because the necessary typecasting would have already - // taken place during bli_packm_init(). 
- bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - // NOTE: We know that scalar_b is of type dt_exec due to the above code - // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, - // and we know that the internal scalar in C is already of the type dt_c - // due to the casting in the implementation of bli_obj_scalar_attach(). - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); - -#if 0 - // NOTE: Turns out that this optimization will never be employed since - // currently bli_gemm_ker_var2_md() is only called when the storage - // datatype of C differs from the execution/computation datatype, and - // this optimization would only make sense if they are equal. - - // If 1m is being employed on a column- or row-stored matrix with a - // real-valued beta, we can use the real domain macro-kernel, which - // eliminates a little overhead associated with the 1m virtual - // micro-kernel. - if ( bli_cntx_method( cntx ) == BLIS_1M ) - { - // Only employ this optimization if the storage datatype of C is - // equal to the execution/computation datatype. - if ( dt_c == dt_exec ) - { - bli_gemm_ind_recast_1m_params - ( - &dt_exec, - schema_a, - c, - &m, &n, &k, - &pd_a, &ps_a, - &pd_b, &ps_b, - &rs_c, &cs_c - ); - } - } -#endif - - // Tweak parameters in select mixed domain cases (rcc, crc, ccr). - bli_gemm_md_ker_var2_recast - ( - &dt_exec, - bli_obj_dt( a ), - bli_obj_dt( b ), - bli_obj_dt( c ), - &m, &n, &k, - &pd_a, &ps_a, - &pd_b, &ps_b, - c, - &rs_c, &cs_c - ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_c][dt_exec]; - - // Invoke the function. 
- f( schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); -} - - -#undef GENTFUNC2 -#define GENTFUNC2( ctype_c, ctype_e, chc, che, varname ) \ -\ -void PASTEMAC2(chc,che,varname) \ - ( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dte = PASTEMAC(che,type); \ - /*const num_t dtc = PASTEMAC(chc,type);*/ \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(che,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dte, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype_e ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_e ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dte, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -\ - ctype_e* restrict zero = PASTEMAC(che,0); \ - ctype_e* restrict a_cast = a; \ - ctype_e* restrict b_cast = b; \ - ctype_c* restrict c_cast = c; \ - ctype_e* restrict alpha_cast = alpha; \ - ctype_c* restrict beta_cast = beta; \ - ctype_e* restrict b1; \ - ctype_c* restrict c1; \ -\ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t i, j; \ - dim_t m_cur; \ - dim_t n_cur; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ - /* - Assumptions/assertions: - rs_a == 1 - cs_a == PACKMR - pd_a == MR - ps_a == stride to next micro-panel of A - rs_b == PACKNR - cs_b == 1 - pd_b == NR - ps_b == stride to next micro-panel of B - rs_c == (no assumptions) - cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(che,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. 
*/ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Determine the thread range and increment for the 2nd and 1st loops. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype_e* restrict a1; \ - ctype_c* restrict c11; \ - ctype_e* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype_e* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. 
*/ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Always save the micropanel product to the local microtile and - then accumulate it into C via the xpbys_mxn macro. */ \ - /*if ( 1 )*/ \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the microtile of C and add the result from above. */ \ - PASTEMAC3(che,chc,chc,xpbys_mxn) \ - ( \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c \ - ); \ - } \ - } \ - } \ -\ -/* -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNC2_BASIC0( gemm_ker_var2_md ) -INSERT_GENTFUNC2_MIXDP0( gemm_ker_var2_md ) - -#endif diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h index 8fcf6bd21..751e271ea 100644 --- a/frame/3/gemm/bli_gemm_md.h +++ b/frame/3/gemm/bli_gemm_md.h @@ -154,7 +154,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast num_t* dt_comp, num_t dt_a, num_t dt_b, - num_t dt_c, + num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, @@ -164,7 +164,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast inc_t* rs_c, inc_t* cs_c ) { - if ( bli_is_real( dt_c ) && + if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { @@ -177,7 +177,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast *ps_a *= 2; *ps_b *= 2; } - else if ( bli_is_complex( dt_c ) && + else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { @@ -197,6 +197,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. 
*dt_comp = bli_dt_proj_to_real( *dt_comp ); + *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; @@ -211,7 +212,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast *ps_a /= 2; } } - else if ( bli_is_complex( dt_c ) && + else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { @@ -231,6 +232,7 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); + *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; @@ -274,54 +276,3 @@ BLIS_INLINE void bli_gemm_md_ker_var2_recast #endif } -// ----------------------------------------------------------------------------- - -// -// Prototype object-based interfaces. -// - -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ - ); - -GENPROT( gemm_ker_var2_md ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT2 -#define GENTPROT2( ctype_c, ctype_e, chc, che, varname ) \ -\ -void PASTEMAC2(chc,che,varname) \ - ( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT2_BASIC0( gemm_ker_var2_md ) -INSERT_GENTPROT2_MIXDP0( gemm_ker_var2_md ) - diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c index 0bfb59630..bbd9190a9 100644 --- a/frame/3/gemm/bli_gemm_md_c2r_ref.c +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c @@ -41,6 +41,8 @@ \ void PASTEMAC2(ch,opname,suf) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ @@ -61,6 +63,9 @@ void PASTEMAC2(ch,opname,suf) \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + dim_t mr_r = mr; \ + dim_t nr_r = nr; \ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype_r ) ] \ @@ -81,6 +86,9 @@ void PASTEMAC2(ch,opname,suf) \ \ ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ +\ + dim_t m_use; \ + dim_t n_use; \ \ ctype_r* c_use; \ inc_t rs_c_use; \ @@ -146,17 +154,16 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \ rs_c_use = rs_ct; \ cs_c_use = cs_ct; \ \ - /* Convert the strides from being in units of complex elements to - be in units of real elements. Note that we don't need to check for - general storage here because that case corresponds to the scenario - where we are using the ct buffer and its rs_ct/cs_ct strides. 
*/ \ - if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ - else rs_c_use *= 2; \ -\ + /* Convert the strides and corresponding microtile dimension from being + in units of complex elements to be in units of real elements. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; mr_r *= 2; } \ + else { rs_c_use *= 2; nr_r *= 2; }\ \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ + mr_r, \ + nr_r, \ k, \ alpha_r, \ a_r, \ @@ -166,14 +173,12 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \ data, \ cntx \ ); \ -\ - dim_t i, j; \ \ /* Accumulate the final result in ct back to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ @@ -181,8 +186,8 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ @@ -190,8 +195,8 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \ } \ else \ { \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ *beta, \ @@ -207,17 +212,19 @@ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \ c_use = ( ctype_r* )c; \ rs_c_use = rs_c; \ cs_c_use = cs_c; \ + m_use = m; \ + n_use = n; \ \ - /* Convert the strides from being in units of complex elements to - be in units of real elements. Note that we don't need to check for - general storage here because that case corresponds to the scenario - where we are using the ct buffer and its rs_ct/cs_ct strides. 
*/ \ - if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ - else rs_c_use *= 2; \ + /* Convert the strides and corresponding microtile dimension from being + in units of complex elements to be in units of real elements. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; m_use *= 2; } \ + else { rs_c_use *= 2; n_use *= 2; } \ \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ + m_use, \ + n_use, \ k, \ alpha_r, \ a_r, \ diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index e7befc5b4..888181bad 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -34,6 +34,16 @@ */ +// +// gemm kernel parameter struct. +// + +typedef struct +{ + gemm_ukr_vft ukr; +} gemm_ker_params_t; + + // // Prototype object-based interfaces. // @@ -59,32 +69,3 @@ GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( gemm_ker_var2 ) - diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h index 7528c4f03..52ea81a5e 100644 --- a/frame/3/gemm/ind/bli_gemm_ind_opt.h +++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h @@ -35,6 +35,7 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, + num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, @@ -57,6 +58,7 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); + *dt_c = bli_dt_proj_to_real( *dt_c ); if ( 
bli_is_1e_packed( schema_a ) ) { diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index a995e6c52..fea4efec0 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -279,6 +279,9 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the @@ -381,43 +384,20 @@ void PASTEMAC(ch,varname) \ And if we're strictly above the diagonal, we do nothing and continue. */ \ { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ @@ -490,6 +470,8 @@ void PASTEMAC(ch,varname) \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ + MR, \ + NR, \ k, \ alpha_cast, \ a1, \ @@ -509,43 +491,20 @@ void PASTEMAC(ch,varname) \ } \ else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ { \ - /* Handle interior and edge cases separately. 
*/ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 3115fc67b..4b849bbc6 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -281,6 +281,9 @@ void PASTEMAC(ch,varname) \ /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the @@ -385,6 +388,8 @@ void PASTEMAC(ch,varname) \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ + MR, \ + NR, \ k, \ alpha_cast, \ a1, \ @@ -404,43 +409,20 @@ void PASTEMAC(ch,varname) \ } \ else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ @@ -512,43 +494,20 @@ void PASTEMAC(ch,varname) \ And if we're strictly below the diagonal, we do nothing and continue. */ \ { \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the edge of C and add the result. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 792281b53..646287f93 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \ function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. 
For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ @@ -254,10 +242,6 @@ void PASTEMAC(ch,varname) \ diagoffa = 0; \ c_cast = c_cast + (i )*rs_c; \ } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ @@ -307,8 +291,8 @@ void PASTEMAC(ch,varname) \ dim_t jr_inc; \ \ /* Determine the thread range and increment for the 2nd loop. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ NOTE: Parallelism in the 1st loop is disabled for now. */ \ bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ @@ -379,47 +363,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Copy edge elements of C to the temporary buffer. 
*/ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - c11, rs_c, cs_c, \ - ct, rs_ct, cs_ct ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_a1011, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ /*}*/ \ \ a1 += ps_a_cur; \ @@ -446,42 +403,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ /*}*/ \ \ a1 += rstep_a; \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 69498540b..9ef2a475d 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \ function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. 
Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ @@ -261,10 +249,6 @@ void PASTEMAC(ch,varname) \ { \ m = -diagoffa + k; \ } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ @@ -386,47 +370,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_a1112, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Copy edge elements of C to the temporary buffer. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - c11, rs_c, cs_c, \ - ct, rs_ct, cs_ct ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_a1112, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_a1112, \ + alpha_cast, \ + a1, \ + b1_i, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ /*}*/ \ \ a1 += ps_a_cur; \ @@ -453,42 +410,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ /*}*/ \ \ a1 += rstep_a; \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 03e3f1e53..f6b20af2e 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \ function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ @@ -261,10 +249,6 @@ void PASTEMAC(ch,varname) \ { \ n = diagoffb + k; \ } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ @@ -335,9 +319,9 @@ void PASTEMAC(ch,varname) \ \ /* Determine the thread range and increment for the 2nd and 1st loops for the initial rectangular region of B (if it exists). - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. \ + NOTE: Parallelism in the 1st loop is disabled for now. */ \ bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ @@ -382,42 +366,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ @@ -501,47 +463,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_b1121, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Copy edge elements of C to the temporary buffer. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - c11, rs_c, cs_c, \ - ct, rs_ct, cs_ct ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_b1121, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_b1121, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ \ a1 += rstep_a; \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 5d63bd46d..f71fb3c4d 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -167,20 +167,8 @@ void PASTEMAC(ch,varname) \ function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. 
*/ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ @@ -262,10 +250,6 @@ void PASTEMAC(ch,varname) \ { \ k = -diagoffb + n; \ } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ @@ -410,47 +394,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_b0111, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Copy edge elements of C to the temporary buffer. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - c11, rs_c, cs_c, \ - ct, rs_ct, cs_ct ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k_b0111, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_b0111, \ + alpha_cast, \ + a1_i, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ \ a1 += rstep_a; \ @@ -476,9 +433,9 @@ void PASTEMAC(ch,varname) \ bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ jb0 = n_iter_tri; \ \ /* Save the resulting value of b1 from the previous loop since it represents @@ -496,7 +453,7 @@ void PASTEMAC(ch,varname) \ the starting address of the rectangular region (which is already n_iter_tri logical iterations through B). */ \ b1 = b_cast + (j-jb0) * cstep_b; \ - c1 = c_cast + j * cstep_c; \ + c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ @@ -533,42 +490,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + one, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index a8196ebb9..0a3be87f7 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -40,27 +40,30 @@ cntl_t* bli_trsm_cntl_create rntm_t* rntm, side_t side, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ) { if ( bli_is_left( side ) ) - return bli_trsm_l_cntl_create( rntm, schema_a, schema_b ); + return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker ); else - return bli_trsm_r_cntl_create( rntm, schema_a, schema_b ); + return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker ); } cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ) { void_fp macro_kernel_p; - // Use the function pointer to the macrokernels that use slab - // assignment of micropanels to threads in the jr and ir loops. + // Set the default macrokernel. If a non-NULL kernel function pointer is + // passed in, we use that instead. macro_kernel_p = bli_trsm_xx_ker_var2; + if ( ker ) macro_kernel_p = ker; const opid_t family = BLIS_TRSM; @@ -202,11 +205,15 @@ cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ) { // NOTE: trsm macrokernels are presently disabled for right-side execution. + // Set the default macrokernel. If a non-NULL kernel function pointer is + // passed in, we use that instead. 
void_fp macro_kernel_p = bli_trsm_xx_ker_var2; + if ( ker ) macro_kernel_p = ker; const opid_t family = BLIS_TRSM; diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 7fdb1fc4f..86f4a29b2 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -38,21 +38,24 @@ cntl_t* bli_trsm_cntl_create rntm_t* rntm, side_t side, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, - pack_t schema_b + pack_t schema_b, + void_fp ker ); void bli_trsm_cntl_free diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index dec41301a..b503efa5b 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -183,7 +183,6 @@ void PASTEMAC(ch,varname) \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ @@ -470,43 +469,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - a1, \ - b1, \ - alpha2_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - alpha2_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ \ a1 += rstep_a; \ } \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 1627a12a3..55ceafb91 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -183,7 +183,6 @@ void PASTEMAC(ch,varname) \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ @@ -480,43 +479,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - a1, \ - b1, \ - alpha2_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - alpha2_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + minus_one, \ + a1, \ + b1, \ + alpha2_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ \ a1 += rstep_a; \ } \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 8cbc26b36..23d4dd728 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -188,7 +188,6 @@ void PASTEMAC(ch,varname) \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ @@ -499,43 +498,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - b1, \ - a1, \ - alpha2_cast, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - b1, \ - a1, \ - zero, \ - ct, cs_ct, rs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - alpha2_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ } \ \ a1 += rstep_a; \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 97399d0ae..71381707c 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -188,7 +188,6 @@ void PASTEMAC(ch,varname) \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ @@ -492,43 +491,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - b1, \ - a1, \ - alpha2_cast, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - k, \ - minus_one, \ - b1, \ - a1, \ - zero, \ - ct, cs_ct, rs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Add the result to the edge of C. */ \ - PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - alpha2_cast, \ - c11, rs_c, cs_c ); \ - } \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + minus_one, \ + b1, \ + a1, \ + alpha2_cast, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ } \ \ a1 += rstep_a; \ diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h index 68b6cc7cd..d8c6cbb13 100644 --- a/frame/base/bli_auxinfo.h +++ b/frame/base/bli_auxinfo.h @@ -74,6 +74,15 @@ BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) return ai->ps_b; } +BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) +{ + return ai->ukr; +} +BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) +{ + return ai->params; +} + // auxinfo_t field modification @@ -118,5 +127,14 @@ BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) ai->ps_b = ps; } -#endif +BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) +{ + ai->ukr = ukr; +} +BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) +{ + ai->params = params; +} + +#endif diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h new file mode 100644 index 000000000..242045a02 --- /dev/null +++ b/frame/include/bli_edge_case_macro_defs.h @@ -0,0 +1,109 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2021, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_EDGE_CASE_MACRO_DEFS_H +#define BLIS_EDGE_CASE_MACRO_DEFS_H + + +// Helper macros for edge-case handling within gemm microkernels. 
+ +#define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major) \ +\ + PASTEMAC(ch,ctype)* restrict _beta = beta; \ + PASTEMAC(ch,ctype)* restrict _c = c; \ + const inc_t _rs_c = rs_c; \ + const inc_t _cs_c = cs_c; \ + PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] /* length in elements: byte budget / element size */ \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t _rs_ct = row_major ? nr : 1; \ + const inc_t _cs_ct = row_major ? 1 : mr; +#define GEMM_UKR_SETUP_CT_POST(ch) \ +\ + PASTEMAC(ch,ctype) _zero; \ + PASTEMAC(ch,set0s)( _zero ); \ + \ + if ( _use_ct ) \ + { \ + c = _ct; \ + rs_c = _rs_ct; \ + cs_c = _cs_ct; \ + beta = &_zero; \ + } +#define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ +\ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ + m != mr || n != nr; \ + GEMM_UKR_SETUP_CT_POST(ch); +#define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ +\ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ + m != mr || n != nr; \ + GEMM_UKR_SETUP_CT_POST(ch); +#define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ +\ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + const bool _use_ct = m != mr || n != nr; \ + GEMM_UKR_SETUP_CT_POST(ch); +#define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ +\ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ + m != mr || n != nr || \ + ( (uintptr_t)_c % alignment ) || \ + ( ( ( row_major ?
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ + GEMM_UKR_SETUP_CT_POST(ch); + +#define GEMM_UKR_FLUSH_CT(ch) \ +\ + if ( _use_ct ) \ + { \ + PASTEMAC(ch,xpbys_mxn) \ + ( \ + m, n, \ + _ct, _rs_ct, _cs_ct, \ + _beta, \ + _c, _rs_c, _cs_c \ + ); \ + } \ + + +#endif + diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 03451d440..be45a12e3 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -98,6 +98,7 @@ #include "bli_gentprot_macro_defs.h" #include "bli_misc_macro_defs.h" +#include "bli_edge_case_macro_defs.h" #include "bli_param_macro_defs.h" #include "bli_obj_macro_defs.h" #include "bli_complex_macro_defs.h" diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 5be0ceeb4..c66505bde 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1144,6 +1144,13 @@ typedef struct inc_t ps_a; inc_t ps_b; + // The type to convert to on output. + //num_t dt_on_output; + + // (Virtual) microkernel address and additional parameters. + void_fp ukr; + void* params; + } auxinfo_t; diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 66337e0b7..913abd1f6 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -42,9 +42,13 @@ // 2vx10 microkernels. #include "armsve_asm_2vx10cmplx.h" +#include "arm_sve.h" + void bli_cgemm_armsve_asm_2vx10_unindexed ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, @@ -59,12 +63,15 @@ void bli_cgemm_armsve_asm_2vx10_unindexed // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t info = 0; + uint64_t mr = svcntw(); + GEMM_UKR_SETUP_CT( c, mr, 10, false ); + __asm__ volatile ( // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" @@ -310,5 +317,7 @@ GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) "z24","z25","z26","z27", "z28","z29","z30","z31" ); + + GEMM_UKR_FLUSH_CT( c ); } diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index e5b78a592..9730fb8ce 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -42,9 +42,13 @@ // 2vx10 microkernels. #include "armsve_asm_2vx10.h" +#include "arm_sve.h" + void bli_dgemm_armsve_asm_2vx10_unindexed ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -59,11 +63,14 @@ void bli_dgemm_armsve_asm_2vx10_unindexed // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + uint64_t mr = 2*svcntd(); + GEMM_UKR_SETUP_CT( d, mr, 10, false ); + __asm__ volatile ( " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" @@ -324,5 +331,7 @@ GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x "z24","z25","z26","z27", "z28","z29","z30","z31" ); + + GEMM_UKR_FLUSH_CT( d ); } diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 00b3f20b4..74c4779d7 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -42,9 +42,13 @@ // 2vx10 microkernels. 
#include "armsve_asm_2vx10.h" +#include "arm_sve.h" + void bli_sgemm_armsve_asm_2vx10_unindexed ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -59,11 +63,14 @@ void bli_sgemm_armsve_asm_2vx10_unindexed // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + uint64_t mr = 2*svcntw(); + GEMM_UKR_SETUP_CT( s, mr, 10, false ); + __asm__ volatile ( " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" @@ -310,5 +317,7 @@ GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x "z24","z25","z26","z27", "z28","z29","z30","z31" ); + + GEMM_UKR_FLUSH_CT( s ); } diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 2fa37664a..ee041b3c4 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -42,9 +42,13 @@ // 2vx10 microkernels. #include "armsve_asm_2vx10cmplx.h" +#include "arm_sve.h" + void bli_zgemm_armsve_asm_2vx10_unindexed ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -59,12 +63,15 @@ void bli_zgemm_armsve_asm_2vx10_unindexed // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t info = 0; + uint64_t mr = svcntd(); + GEMM_UKR_SETUP_CT( z, mr, 10, false ); + __asm__ volatile ( // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" @@ -309,5 +316,7 @@ GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) "z24","z25","z26","z27", "z28","z29","z30","z31" ); + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c index 3d25719d9..641944ecd 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c @@ -42,9 +42,13 @@ // 2vx7 microkernels. #include "armsve_asm_2vx7cmplx.h" +#include "arm_sve.h" + void bli_zgemm_armsve_asm_2vx7_unindexed ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -59,12 +63,15 @@ void bli_zgemm_armsve_asm_2vx7_unindexed // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t info = 0; + uint64_t mr = svcntd(); + GEMM_UKR_SETUP_CT( z, mr, 7, false ); + __asm__ volatile ( // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" @@ -261,6 +268,8 @@ GEMM_CCMPLX_STORE_COL7_G(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27 "z24","z25","z26","z27", "z28","z29","z30","z31" ); + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c index d0eef4a8c..4272f72c0 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c @@ -42,9 +42,13 @@ // 2vx8 microkernels. #include "armsve_asm_2vx8cmplx.h" +#include "arm_sve.h" + void bli_zgemm_armsve_asm_2vx8_unindexed ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -59,12 +63,15 @@ void bli_zgemm_armsve_asm_2vx8_unindexed // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_mker = k0 / 6; - uint64_t k_left = k0 % 6; + uint64_t k_mker = k / 6; + uint64_t k_left = k % 6; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t info = 0; + uint64_t mr = svcntd(); + GEMM_UKR_SETUP_CT( z, mr, 8, false ); + __asm__ volatile ( // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" @@ -286,5 +293,7 @@ GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z16,%2,%4,x16) "z24","z25","z26","z27", "z28","z29","z30","z31" ); + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c index b526cd095..c248285c3 100644 --- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c +++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c @@ -48,23 +48,23 @@ void bli_sgemm_armv7a_ker_4x4 void bli_sgemm_armv7a_asm_4x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, - float* restrict c, inc_t rs_c0, inc_t cs_c0, + float* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint32_t k = k0; - uint32_t rs_c = rs_c0; - uint32_t cs_c = cs_c0; - + GEMM_UKR_SETUP_CT_ANY( s, 4, 4, false ); bli_sgemm_armv7a_ker_4x4( k, alpha, a, b, beta, c, rs_c, cs_c, data ); + GEMM_UKR_FLUSH_CT( s ); } @@ -83,23 +83,23 @@ void bli_dgemm_armv7a_ker_4x4 void bli_dgemm_armv7a_asm_4x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, + double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint32_t k = k0; - uint32_t rs_c = rs_c0; - uint32_t cs_c = cs_c0; - + GEMM_UKR_SETUP_CT_ANY( d, 4, 4, false ); bli_dgemm_armv7a_ker_4x4( k, alpha, a, b, beta, c, rs_c, cs_c, data ); + GEMM_UKR_FLUSH_CT( d ); } @@ -118,23 +118,23 @@ void bli_cgemm_armv7a_ker_2x2 void bli_cgemm_armv7a_asm_2x2 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, - scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint32_t k = k0; - uint32_t rs_c = rs_c0; - uint32_t cs_c = cs_c0; - + GEMM_UKR_SETUP_CT_ANY( c, 2, 2, false ); bli_cgemm_armv7a_ker_2x2( k, alpha, a, b, beta, c, rs_c, cs_c, data ); + GEMM_UKR_FLUSH_CT( c ); } @@ -153,22 +153,22 @@ void bli_zgemm_armv7a_ker_2x2 void bli_zgemm_armv7a_asm_2x2 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, - dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint32_t k = k0; - uint32_t rs_c = rs_c0; - uint32_t cs_c = cs_c0; - + GEMM_UKR_SETUP_CT_ANY( z, 2, 2, false ); bli_zgemm_armv7a_ker_2x2( k, alpha, a, b, beta, c, rs_c, cs_c, data ); + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c index b9db58726..06f36a346 100644 --- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c +++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c @@ -37,7 +37,9 @@ void bli_sgemm_armv7a_int_4x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -49,12 +51,14 @@ void bli_sgemm_armv7a_int_4x4 { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint32_t k_iter = k0 / 4; - uint32_t k_left = k0 % 4; + uint32_t k_iter = k / 4; + uint32_t k_left = k % 4; uint32_t rs_c = rs_c0; uint32_t cs_c = cs_c0; uint32_t i; + GEMM_UKR_SETUP_CT( s, 4, 4, false ); + void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); @@ -82,47 +86,17 @@ void bli_sgemm_armv7a_int_4x4 if ( *beta != 0.0F ) { - if ( rs_c == 1 ) - { - // Load column 0 - cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c ); - - // Load column 1 - cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c ); - - // Load column 2 - cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c ); - - // Load column 3 - cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c ); - } - else - { - // Load column 0 - cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0); - cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1); - cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2); - cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3); - - // Load column 1 - cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0); - cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1); - cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2); - cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3); - - // Load column 2 - cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0); - cv2 = 
vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1); - cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2); - cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3); - - // Load column 3 - cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0); - cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1); - cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2); - cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3); - - } + // Load column 0 + cv0 = vld1q_f32( c + 0*cs_c ); + + // Load column 1 + cv1 = vld1q_f32( c + 1*cs_c ); + + // Load column 2 + cv2 = vld1q_f32( c + 2*cs_c ); + + // Load column 3 + cv3 = vld1q_f32( c + 3*cs_c ); } else { @@ -255,47 +229,22 @@ void bli_sgemm_armv7a_int_4x4 cv3 = vmlaq_f32( cv3, abv3, alphav ); } - if ( rs_c == 1 ) - { - // Store column 0 - vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 ); - // Store column 1 - vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 ); - // Store column 2 - vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 ); - // Store column 3 - vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 ); - } - else - { - // Store column 0 - vst1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0); - vst1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1); - vst1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2); - vst1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3); - - // Store column 1 - vst1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0); - vst1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1); - vst1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2); - vst1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3); - - // Store column 2 - vst1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0); - vst1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1); - vst1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2); - vst1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3); - - // Store column 3 - vst1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0); - vst1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1); - vst1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2); - vst1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3); - } + // Store column 0 + vst1q_f32( c + 0*cs_c, cv0 ); + // Store column 1 + vst1q_f32( c + 1*cs_c, cv1 ); + // Store column 2 + vst1q_f32( c + 
2*cs_c, cv2 ); + // Store column 3 + vst1q_f32( c + 3*cs_c, cv3 ); + + GEMM_UKR_FLUSH_CT( s ); } void bli_dgemm_armv7a_int_4x4 ( + dim_t m, + dim_t n, dim_t k, double* restrict alpha, double* restrict a, @@ -314,6 +263,8 @@ void bli_dgemm_armv7a_int_4x4 uint32_t cs_c = cs_c0; uint32_t i; + GEMM_UKR_SETUP_CT_ANY( d, 4, 4, false ); + //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); @@ -568,5 +519,7 @@ void bli_dgemm_armv7a_int_4x4 *c23 += ab23 * *alpha; *c33 += ab33 * *alpha; } + + GEMM_UKR_FLUSH_CT( d ); } diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index dfdda863b..7b420f202 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -1,4 +1,4 @@ - /* + /* BLIS An object-based framework for developing high-performance BLAS-like @@ -40,20 +40,22 @@ o 4x4 Single precision micro-kernel fully functional. o Runnable on ARMv8, compiled with aarch64 GCC. o Use it together with the armv8 BLIS configuration. - o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz. + o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz. December 2014. - + * UPDATE NOVEMBER 2015 * Micro-kernel changed to 8x12 * Tested on Juno Board. Around 8.1 GFLOPS, 1 x A57 core @ 1.1 GHz. * Tested on Juno Board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz. - * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. + * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz. */ void bli_sgemm_armv8a_asm_8x12 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -68,1020 +70,1023 @@ void bli_sgemm_armv8a_asm_8x12 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( s, 8, 12, false ); -__asm__ volatile -( -" \n\t" -" \n\t" -" ldr x0,%[aaddr] \n\t" // Load address of A. -" ldr x1,%[baddr] \n\t" // Load address of B. -" ldr x2,%[caddr] \n\t" // Load address of C. -" \n\t" -" ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). -" ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). -" \n\t" -" ldr x10,%[cs_c] \n\t" // Load cs_c. -" lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. -" \n\t" -" ldr x14,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). -" \n\t" -" add x16,x2,x10 \n\t" //Load address Column 1 of C -" add x17,x16,x10 \n\t" //Load address Column 2 of C -" add x19,x17,x10 \n\t" //Load address Column 3 of C -" add x20,x19,x10 \n\t" //Load address Column 4 of C -" add x21,x20,x10 \n\t" //Load address Column 5 of C -" add x22,x21,x10 \n\t" //Load address Column 6 of C -" add x23,x22,x10 \n\t" //Load address Column 7 of C -" add x24,x23,x10 \n\t" //Load address Column 8 of C -" add x25,x24,x10 \n\t" //Load address Column 9 of C -" add x26,x25,x10 \n\t" //Load address Column 10 of C -" add x27,x26,x10 \n\t" //Load address Column 11 of C -" \n\t" -" prfm pldl1keep,[x2] \n\t" // Prefetch c. -" prfm pldl1keep,[x16] \n\t" // Prefetch c. -" prfm pldl1keep,[x17] \n\t" // Prefetch c. -" prfm pldl1keep,[x19] \n\t" // Prefetch c. -" prfm pldl1keep,[x20] \n\t" // Prefetch c. -" prfm pldl1keep,[x21] \n\t" // Prefetch c. -" prfm pldl1keep,[x22] \n\t" // Prefetch c. -" prfm pldl1keep,[x23] \n\t" // Prefetch c. -" prfm pldl1keep,[x24] \n\t" // Prefetch c. -" prfm pldl1keep,[x25] \n\t" // Prefetch c. -" prfm pldl1keep,[x26] \n\t" // Prefetch c. -" prfm pldl1keep,[x27] \n\t" // Prefetch c. 
-" \n\t" -" dup v8.4s, wzr \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #192] \n\t" -" dup v9.4s, wzr \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #256] \n\t" -" dup v10.4s, wzr \n\t" // Vector for accummulating column 1 -" prfm PLDL1KEEP, [x1, #320] \n\t" -" dup v11.4s, wzr \n\t" // Vector for accummulating column 1 -" dup v12.4s, wzr \n\t" // Vector for accummulating column 2 -" dup v13.4s, wzr \n\t" // Vector for accummulating column 2 -" \n\t" -" dup v14.4s, wzr \n\t" // Vector for accummulating column 3 -" prfm PLDL1KEEP, [x0, #128] \n\t" -" dup v15.4s, wzr \n\t" // Vector for accummulating column 3 -" prfm PLDL1KEEP, [x0, #192] \n\t" -" dup v16.4s, wzr \n\t" // Vector for accummulating column 4 -" dup v17.4s, wzr \n\t" // Vector for accummulating column 4 -" dup v18.4s, wzr \n\t" // Vector for accummulating column 5 -" dup v19.4s, wzr \n\t" // Vector for accummulating column 5 -" \n\t" -" dup v20.4s, wzr \n\t" // Vector for accummulating column 6 -" dup v21.4s, wzr \n\t" // Vector for accummulating column 6 -" dup v22.4s, wzr \n\t" // Vector for accummulating column 7 -" dup v23.4s, wzr \n\t" // Vector for accummulating column 7 -" dup v24.4s, wzr \n\t" // Vector for accummulating column 8 -" dup v25.4s, wzr \n\t" // Vector for accummulating column 8 -" \n\t" -" dup v26.4s, wzr \n\t" // Vector for accummulating column 9 -" dup v27.4s, wzr \n\t" // Vector for accummulating column 9 -" dup v28.4s, wzr \n\t" // Vector for accummulating column 10 -" dup v29.4s, wzr \n\t" // Vector for accummulating column 10 -" dup v30.4s, wzr \n\t" // Vector for accummulating column 11 -" dup v31.4s, wzr \n\t" // Vector for accummulating column 11 -" \n\t" -" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. 
-BEQ(SCONSIDERKLEFT) -" \n\t" -" ldr q0, [x0] \n\t" -" ldr q1, [x0, #16] \n\t" // Load a -" \n\t" -" ldr q2, [x1] \n\t" // Load b -" ldr q3, [x1, #16] \n\t" -" ldr q4, [x1, #32] \n\t" -" \n\t" -" add x0, x0, #32 \n\t" //update address of A -" add x1, x1, #48 \n\t" //update address of B -" \n\t" -" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -BEQ(SLASTITER) // (as loop is do-while-like). -" \n\t" -LABEL(SLOOPKITER) // Body of the k_iter loop. -" \n\t" -" ldr q5, [x0] \n\t" -" fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s, v1.4s,v2.s[0] \n\t" // Accummulate. -" ldr q6, [x0, #16] \n\t" -" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. -" ldr q2, [x1] \n\t" -" \n\t" -" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. -" prfm PLDL1KEEP, [x1, #336] \n\t" -" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. -" prfm PLDL1KEEP, [x1, #400] \n\t" -" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. -" prfm PLDL1KEEP, [x1, #464] \n\t" -" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. -" ldr q3, [x1, #16] \n\t" -" \n\t" -" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. 
-" ldr q4, [x1, #32] \n\t" -" \n\t" //End It 1 -" \n\t" -" ldr q0, [x0, #32] \n\t" -" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. -" ldr q1, [x0, #48] \n\t" -" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. -" ldr q2, [x1, #48] \n\t" -" \n\t" -" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. -" prfm PLDL1KEEP, [x0, #224] \n\t" -" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. -" prfm PLDL1KEEP, [x0, #288] \n\t" -" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. -" ldr q3, [x1, #64] \n\t" -" \n\t" -" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. -" ldr q4, [x1, #80] \n\t" -" \n\t" //End It 2 -" \n\t" -" ldr q5, [x0, #64] \n\t" -" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. -" ldr q6, [x0, #80] \n\t" -" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. 
-" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. -" ldr q2, [x1, #96] \n\t" -" \n\t" -" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. -" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. -" ldr q3, [x1, #112] \n\t" -" \n\t" -" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. -" ldr q4, [x1, #128] \n\t" -" \n\t" //End It 3 -" \n\t" -" ldr q0, [x0, #96] \n\t" -" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. -" ldr q1, [x0, #112] \n\t" -" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. -" ldr q2, [x1, #144] \n\t" -" \n\t" -" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. -" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. -" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. 
-" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. -" ldr q3, [x1, #160] \n\t" -" \n\t" -" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. -" ldr q4, [x1, #176] \n\t" -" add x1, x1, #192 \n\t" -" add x0, x0, #128 \n\t" -" \n\t" //End It 4 -" sub x5,x5,1 \n\t" // i-=1. -" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. -BNE(SLOOPKITER) -" \n\t" -LABEL(SLASTITER) // Last iteration of k_iter loop. -" \n\t" -" \n\t" -" ldr q5, [x0] \n\t" -" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. -" ldr q6, [x0, #16] \n\t" -" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. -" ldr q2, [x1] \n\t" -" \n\t" -" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. -" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. -" ldr q3, [x1, #16] \n\t" -" \n\t" -" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. 
-" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. -" ldr q4, [x1, #32] \n\t" -" \n\t" //End It 1 -" \n\t" -" ldr q0, [x0, #32] \n\t" -" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. -" ldr q1, [x0, #48] \n\t" -" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. -" ldr q2, [x1, #48] \n\t" -" \n\t" -" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. -" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. -" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. -" ldr q3, [x1, #64] \n\t" -" \n\t" -" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. -" ldr q4, [x1, #80] \n\t" -" \n\t" //End It 2 -" \n\t" -" ldr q5, [x0, #64] \n\t" -" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. -" ldr q6, [x0, #80] \n\t" -" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. 
-" ldr q2, [x1, #96] \n\t" -" \n\t" -" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. -" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. -" ldr q3, [x1, #112] \n\t" -" \n\t" -" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. -" ldr q4, [x1, #128] \n\t" -" \n\t" //End It 3 -" \n\t" -" fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. -" fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. -" fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. -" fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. 
-" \n\t" -" fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. -" add x1, x1, #144 \n\t" -" add x0, x0, #96 \n\t" -" \n\t" //End It 4 -" \n\t" -LABEL(SCONSIDERKLEFT) -" cmp x6,0 \n\t" // If k_left == 0, we are done. -BEQ(SPOSTACCUM) // else, we enter the k_left loop. -" \n\t" -LABEL(SLOOPKLEFT) // Body of the left iterations -" \n\t" -" ldr q0, [x0],#16 \n\t" -" ldr q1, [x0],#16 \n\t" // Load a -" \n\t" -" ldr q2, [x1],#16 \n\t" // Load b -" ldr q3, [x1],#16 \n\t" -" ldr q4, [x1],#16 \n\t" -" \n\t" -" sub x6,x6,1 \n\t" // i = i-1. -" \n\t" -" fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. -" fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. -" fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. -" fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. -" fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. -" fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. -" fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. -" fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. -" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. -" fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. -" fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. -" fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. -" fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. -" fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. -" fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. -" \n\t" -" fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. -" fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. -" fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. -" fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. -" fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. -" fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. -" fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. -" fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. -" \n\t" -" cmp x6,0 \n\t" // Iterate again. 
-BNE(SLOOPKLEFT) // if i!=0. -" \n\t" -LABEL(SPOSTACCUM) -" \n\t" -" ldr x0,%[alpha] \n\t" // Alpha address. -" ldr x1,%[beta] \n\t" // Beta address. -" \n\t" -" ld1r {v6.4s},[x0] \n\t" // Load alpha. -" ld1r {v7.4s},[x1] \n\t" // Load beta -" \n\t" -" ldr x0,%[a_next] \n\t" // Pointer to next block of A. -" ldr x1,%[b_next] \n\t" // Pointer to next pointer of B. -" \n\t" -" cmp x14,#4 \n\t" // If rs_c != 1 (column-major) -BNE(SGENSTORED) -" \n\t" -LABEL(SCOLSTORED) // C is column-major. -" \n\t" -" dup v0.4s, wzr \n\t" -" dup v1.4s, wzr \n\t" -" dup v2.4s, wzr \n\t" -" dup v3.4s, wzr \n\t" -" dup v4.4s, wzr \n\t" -" dup v5.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. -" \n\t" -" ldr q0, [x2] \n\t" //Load column 0 of C -" ldr q1, [x2, #16] \n\t" -" ldr q2, [x16] \n\t" //Load column 1 of C -" ldr q3, [x16, #16] \n\t" -" ldr q4, [x17] \n\t" //Load column 2 of C -" ldr q5, [x17, #16] \n\t" -" \n\t" -" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta -" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta -" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta -" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta -" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta -" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROCOLSTOREDS1) -" \n\t" -" fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" str q0, [x2] \n\t" //Store column 0 of C -" str q1, [x2, #16] \n\t" -" str q2, [x16] \n\t" //Store column 1 of C -" str q3, [x16, #16] \n\t" -" str q4, [x17] \n\t" //Store column 2 of C -" str q5, [x17, #16] \n\t" -" \n\t" -" dup v8.4s, wzr \n\t" -" dup v9.4s, wzr \n\t" -" dup v10.4s, wzr \n\t" -" dup v11.4s, wzr \n\t" -" dup v12.4s, wzr \n\t" 
-" dup v13.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. -" \n\t" -" ldr q8, [x19] \n\t" //Load column 3 of C -" ldr q9, [x19, #16] \n\t" -" ldr q10, [x20] \n\t" //Load column 4 of C -" ldr q11, [x20, #16] \n\t" -" ldr q12, [x21] \n\t" //Load column 5 of C -" ldr q13, [x21, #16] \n\t" -" \n\t" -" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta -" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta -" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta -" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta -" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta -" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROCOLSTOREDS2) -" \n\t" -" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" str q8, [x19] \n\t" //Store column 3 of C -" str q9, [x19, #16] \n\t" -" str q10, [x20] \n\t" //Store column 4 of C -" str q11, [x20, #16] \n\t" -" str q12, [x21] \n\t" //Store column 5 of C -" str q13, [x21, #16] \n\t" -" \n\t" -" dup v0.4s, wzr \n\t" -" dup v1.4s, wzr \n\t" -" dup v2.4s, wzr \n\t" -" dup v3.4s, wzr \n\t" -" dup v4.4s, wzr \n\t" -" dup v5.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
-" \n\t" -" ldr q0, [x22] \n\t" //Load column 6 of C -" ldr q1, [x22, #16] \n\t" -" ldr q2, [x23] \n\t" //Load column 7 of C -" ldr q3, [x23, #16] \n\t" -" ldr q4, [x24] \n\t" //Load column 8 of C -" ldr q5, [x24, #16] \n\t" -" \n\t" -" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta -" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta -" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta -" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta -" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta -" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROCOLSTOREDS3) -" \n\t" -" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" str q0, [x22] \n\t" //Store column 6 of C -" str q1, [x22, #16] \n\t" -" str q2, [x23] \n\t" //Store column 7 of C -" str q3, [x23, #16] \n\t" -" str q4, [x24] \n\t" //Store column 8 of C -" str q5, [x24, #16] \n\t" -" \n\t" -" dup v8.4s, wzr \n\t" -" dup v9.4s, wzr \n\t" -" dup v10.4s, wzr \n\t" -" dup v11.4s, wzr \n\t" -" dup v12.4s, wzr \n\t" -" dup v13.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
-" \n\t" -" ldr q8, [x25] \n\t" //Load column 9 of C -" ldr q9, [x25, #16] \n\t" -" ldr q10, [x26] \n\t" //Load column 10 of C -" ldr q11, [x26, #16] \n\t" -" ldr q12, [x27] \n\t" //Load column 11 of C -" ldr q13, [x27, #16] \n\t" -" \n\t" -" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta -" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta -" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta -" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta -" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta -" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROCOLSTOREDS4) -" \n\t" -" prfm pldl2keep,[x0] \n\t" -" prfm pldl2keep,[x1] \n\t" -" \n\t" -" fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" str q8, [x25] \n\t" //Store column 9 of C -" str q9, [x25, #16] \n\t" -" str q10, [x26] \n\t" //Store column 10 of C -" str q11, [x26, #16] \n\t" -" str q12, [x27] \n\t" //Store column 11 of C -" str q13, [x27, #16] \n\t" -" \n\t" -" \n\t" -BRANCH(SEND) // Done. -" \n\t" -" \n\t" -LABEL(SGENSTORED) // C is general-stride stored. -" \n\t" -" \n\t" -" dup v0.4s, wzr \n\t" -" dup v1.4s, wzr \n\t" -" dup v2.4s, wzr \n\t" -" dup v3.4s, wzr \n\t" -" dup v4.4s, wzr \n\t" -" dup v5.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. -" \n\t" -" mov x5, x2 \n\t" -" \n\t" -" ld1 {v0.s}[0],[x5],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v0.s}[1],[x5],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v0.s}[2],[x5],x14 \n\t" // Load c02 into quad and increment by rs_c. -" ld1 {v0.s}[3],[x5],x14 \n\t" // Load c03 into quad and increment by rs_c. 
-" ld1 {v1.s}[0],[x5],x14 \n\t" // Load c04 into quad and increment by rs_c. -" ld1 {v1.s}[1],[x5],x14 \n\t" // Load c05 into quad and increment by rs_c. -" ld1 {v1.s}[2],[x5],x14 \n\t" // Load c06 into quad and increment by rs_c. -" ld1 {v1.s}[3],[x5],x14 \n\t" // Load c07 into quad and increment by rs_c. -" \n\t" -" mov x5, x16 \n\t" -" \n\t" -" ld1 {v2.s}[0],[x5],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v2.s}[1],[x5],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v2.s}[2],[x5],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v2.s}[3],[x5],x14 \n\t" // Load c13 into quad and increment by rs_c. -" ld1 {v3.s}[0],[x5],x14 \n\t" // Load c14 into quad and increment by rs_c. -" ld1 {v3.s}[1],[x5],x14 \n\t" // Load c15 into quad and increment by rs_c. -" ld1 {v3.s}[2],[x5],x14 \n\t" // Load c16 into quad and increment by rs_c. -" ld1 {v3.s}[3],[x5],x14 \n\t" // Load c17 into quad and increment by rs_c. -" \n\t" -" mov x5, x17 \n\t" -" \n\t" -" ld1 {v4.s}[0],[x5],x14 \n\t" // Load c20 into quad and increment by rs_c. -" ld1 {v4.s}[1],[x5],x14 \n\t" // Load c21 into quad and increment by rs_c. -" ld1 {v4.s}[2],[x5],x14 \n\t" // Load c22 into quad and increment by rs_c. -" ld1 {v4.s}[3],[x5],x14 \n\t" // Load c23 into quad and increment by rs_c. -" ld1 {v5.s}[0],[x5],x14 \n\t" // Load c24 into quad and increment by rs_c. -" ld1 {v5.s}[1],[x5],x14 \n\t" // Load c25 into quad and increment by rs_c. -" ld1 {v5.s}[2],[x5],x14 \n\t" // Load c26 into quad and increment by rs_c. -" ld1 {v5.s}[3],[x5],x14 \n\t" // Load c27 into quad and increment by rs_c. 
-" \n\t" -" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta -" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta -" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta -" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta -" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta -" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROGENSTOREDS1) -" \n\t" -" fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" mov x5, x2 \n\t" -" \n\t" -" st1 {v0.s}[0],[x5],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v0.s}[1],[x5],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v0.s}[2],[x5],x14 \n\t" // Store c02 into quad and increment by rs_c. -" st1 {v0.s}[3],[x5],x14 \n\t" // Store c03 into quad and increment by rs_c. -" st1 {v1.s}[0],[x5],x14 \n\t" // Store c04 into quad and increment by rs_c. -" st1 {v1.s}[1],[x5],x14 \n\t" // Store c05 into quad and increment by rs_c. -" st1 {v1.s}[2],[x5],x14 \n\t" // Store c06 into quad and increment by rs_c. -" st1 {v1.s}[3],[x5],x14 \n\t" // Store c07 into quad and increment by rs_c. -" \n\t" -" mov x5, x16 \n\t" -" \n\t" -" st1 {v2.s}[0],[x5],x14 \n\t" // Store c10 into quad and increment by rs_c. -" st1 {v2.s}[1],[x5],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v2.s}[2],[x5],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v2.s}[3],[x5],x14 \n\t" // Store c13 into quad and increment by rs_c. -" st1 {v3.s}[0],[x5],x14 \n\t" // Store c14 into quad and increment by rs_c. -" st1 {v3.s}[1],[x5],x14 \n\t" // Store c15 into quad and increment by rs_c. -" st1 {v3.s}[2],[x5],x14 \n\t" // Store c16 into quad and increment by rs_c. 
-" st1 {v3.s}[3],[x5],x14 \n\t" // Store c17 into quad and increment by rs_c. -" \n\t" -" mov x5, x17 \n\t" -" \n\t" -" st1 {v4.s}[0],[x5],x14 \n\t" // Store c20 into quad and increment by rs_c. -" st1 {v4.s}[1],[x5],x14 \n\t" // Store c21 into quad and increment by rs_c. -" st1 {v4.s}[2],[x5],x14 \n\t" // Store c22 into quad and increment by rs_c. -" st1 {v4.s}[3],[x5],x14 \n\t" // Store c23 into quad and increment by rs_c. -" st1 {v5.s}[0],[x5],x14 \n\t" // Store c24 into quad and increment by rs_c. -" st1 {v5.s}[1],[x5],x14 \n\t" // Store c25 into quad and increment by rs_c. -" st1 {v5.s}[2],[x5],x14 \n\t" // Store c26 into quad and increment by rs_c. -" st1 {v5.s}[3],[x5],x14 \n\t" // Store c27 into quad and increment by rs_c. -" \n\t" -" dup v8.4s, wzr \n\t" -" dup v9.4s, wzr \n\t" -" dup v10.4s, wzr \n\t" -" dup v11.4s, wzr \n\t" -" dup v12.4s, wzr \n\t" -" dup v13.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. -" \n\t" -" mov x5, x19 \n\t" -" \n\t" -" ld1 {v8.s}[0],[x5],x14 \n\t" // Load c30 into quad and increment by rs_c. -" ld1 {v8.s}[1],[x5],x14 \n\t" // Load c31 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x5],x14 \n\t" // Load c32 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x5],x14 \n\t" // Load c33 into quad and increment by rs_c. -" ld1 {v9.s}[0],[x5],x14 \n\t" // Load c34 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x5],x14 \n\t" // Load c35 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x5],x14 \n\t" // Load c36 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x5],x14 \n\t" // Load c37 into quad and increment by rs_c. -" \n\t" -" mov x5, x20 \n\t" -" \n\t" -" ld1 {v10.s}[0],[x5],x14 \n\t" // Load c40 into quad and increment by rs_c. -" ld1 {v10.s}[1],[x5],x14 \n\t" // Load c41 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x5],x14 \n\t" // Load c42 into quad and increment by rs_c. -" ld1 {v10.s}[3],[x5],x14 \n\t" // Load c43 into quad and increment by rs_c. 
-" ld1 {v11.s}[0],[x5],x14 \n\t" // Load c44 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x5],x14 \n\t" // Load c45 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x5],x14 \n\t" // Load c46 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x5],x14 \n\t" // Load c47 into quad and increment by rs_c. -" \n\t" -" mov x5, x21 \n\t" -" \n\t" -" ld1 {v12.s}[0],[x5],x14 \n\t" // Load c50 into quad and increment by rs_c. -" ld1 {v12.s}[1],[x5],x14 \n\t" // Load c51 into quad and increment by rs_c. -" ld1 {v12.s}[2],[x5],x14 \n\t" // Load c52 into quad and increment by rs_c. -" ld1 {v12.s}[3],[x5],x14 \n\t" // Load c53 into quad and increment by rs_c. -" ld1 {v13.s}[0],[x5],x14 \n\t" // Load c54 into quad and increment by rs_c. -" ld1 {v13.s}[1],[x5],x14 \n\t" // Load c55 into quad and increment by rs_c. -" ld1 {v13.s}[2],[x5],x14 \n\t" // Load c56 into quad and increment by rs_c. -" ld1 {v13.s}[3],[x5],x14 \n\t" // Load c57 into quad and increment by rs_c. -" \n\t" -" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta -" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta -" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta -" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta -" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta -" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROGENSTOREDS2) -" \n\t" -" fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" mov x5, x19 \n\t" -" \n\t" -" st1 {v8.s}[0],[x5],x14 \n\t" // Store c30 into quad and increment by rs_c. -" st1 {v8.s}[1],[x5],x14 \n\t" // Store c31 into quad and increment by rs_c. -" st1 {v8.s}[2],[x5],x14 \n\t" // Store c32 into quad and increment by rs_c. 
-" st1 {v8.s}[3],[x5],x14 \n\t" // Store c33 into quad and increment by rs_c. -" st1 {v9.s}[0],[x5],x14 \n\t" // Store c34 into quad and increment by rs_c. -" st1 {v9.s}[1],[x5],x14 \n\t" // Store c35 into quad and increment by rs_c. -" st1 {v9.s}[2],[x5],x14 \n\t" // Store c36 into quad and increment by rs_c. -" st1 {v9.s}[3],[x5],x14 \n\t" // Store c37 into quad and increment by rs_c. -" \n\t" -" mov x5, x20 \n\t" -" \n\t" -" st1 {v10.s}[0],[x5],x14 \n\t" // Store c40 into quad and increment by rs_c. -" st1 {v10.s}[1],[x5],x14 \n\t" // Store c41 into quad and increment by rs_c. -" st1 {v10.s}[2],[x5],x14 \n\t" // Store c42 into quad and increment by rs_c. -" st1 {v10.s}[3],[x5],x14 \n\t" // Store c43 into quad and increment by rs_c. -" st1 {v11.s}[0],[x5],x14 \n\t" // Store c44 into quad and increment by rs_c. -" st1 {v11.s}[1],[x5],x14 \n\t" // Store c45 into quad and increment by rs_c. -" st1 {v11.s}[2],[x5],x14 \n\t" // Store c46 into quad and increment by rs_c. -" st1 {v11.s}[3],[x5],x14 \n\t" // Store c47 into quad and increment by rs_c. -" \n\t" -" mov x5, x21 \n\t" -" \n\t" -" st1 {v12.s}[0],[x5],x14 \n\t" // Store c50 into quad and increment by rs_c. -" st1 {v12.s}[1],[x5],x14 \n\t" // Store c51 into quad and increment by rs_c. -" st1 {v12.s}[2],[x5],x14 \n\t" // Store c52 into quad and increment by rs_c. -" st1 {v12.s}[3],[x5],x14 \n\t" // Store c53 into quad and increment by rs_c. -" st1 {v13.s}[0],[x5],x14 \n\t" // Store c54 into quad and increment by rs_c. -" st1 {v13.s}[1],[x5],x14 \n\t" // Store c55 into quad and increment by rs_c. -" st1 {v13.s}[2],[x5],x14 \n\t" // Store c56 into quad and increment by rs_c. -" st1 {v13.s}[3],[x5],x14 \n\t" // Store c57 into quad and increment by rs_c. -" \n\t" -" dup v0.4s, wzr \n\t" -" dup v1.4s, wzr \n\t" -" dup v2.4s, wzr \n\t" -" dup v3.4s, wzr \n\t" -" dup v4.4s, wzr \n\t" -" dup v5.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. 
-" \n\t" -" mov x5, x22 \n\t" -" \n\t" -" ld1 {v0.s}[0],[x5],x14 \n\t" // Load c60 into quad and increment by rs_c. -" ld1 {v0.s}[1],[x5],x14 \n\t" // Load c61 into quad and increment by rs_c. -" ld1 {v0.s}[2],[x5],x14 \n\t" // Load c62 into quad and increment by rs_c. -" ld1 {v0.s}[3],[x5],x14 \n\t" // Load c63 into quad and increment by rs_c. -" ld1 {v1.s}[0],[x5],x14 \n\t" // Load c64 into quad and increment by rs_c. -" ld1 {v1.s}[1],[x5],x14 \n\t" // Load c65 into quad and increment by rs_c. -" ld1 {v1.s}[2],[x5],x14 \n\t" // Load c66 into quad and increment by rs_c. -" ld1 {v1.s}[3],[x5],x14 \n\t" // Load c67 into quad and increment by rs_c. -" \n\t" -" mov x5, x23 \n\t" -" \n\t" -" ld1 {v2.s}[0],[x5],x14 \n\t" // Load c70 into quad and increment by rs_c. -" ld1 {v2.s}[1],[x5],x14 \n\t" // Load c71 into quad and increment by rs_c. -" ld1 {v2.s}[2],[x5],x14 \n\t" // Load c72 into quad and increment by rs_c. -" ld1 {v2.s}[3],[x5],x14 \n\t" // Load c73 into quad and increment by rs_c. -" ld1 {v3.s}[0],[x5],x14 \n\t" // Load c74 into quad and increment by rs_c. -" ld1 {v3.s}[1],[x5],x14 \n\t" // Load c75 into quad and increment by rs_c. -" ld1 {v3.s}[2],[x5],x14 \n\t" // Load c76 into quad and increment by rs_c. -" ld1 {v3.s}[3],[x5],x14 \n\t" // Load c77 into quad and increment by rs_c. -" \n\t" -" mov x5, x24 \n\t" -" \n\t" -" ld1 {v4.s}[0],[x5],x14 \n\t" // Load c80 into quad and increment by rs_c. -" ld1 {v4.s}[1],[x5],x14 \n\t" // Load c81 into quad and increment by rs_c. -" ld1 {v4.s}[2],[x5],x14 \n\t" // Load c82 into quad and increment by rs_c. -" ld1 {v4.s}[3],[x5],x14 \n\t" // Load c83 into quad and increment by rs_c. -" ld1 {v5.s}[0],[x5],x14 \n\t" // Load c84 into quad and increment by rs_c. -" ld1 {v5.s}[1],[x5],x14 \n\t" // Load c85 into quad and increment by rs_c. -" ld1 {v5.s}[2],[x5],x14 \n\t" // Load c86 into quad and increment by rs_c. -" ld1 {v5.s}[3],[x5],x14 \n\t" // Load c87 into quad and increment by rs_c. 
-" \n\t" -" fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta -" fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta -" fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta -" fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta -" fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta -" fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROGENSTOREDS3) -" \n\t" -" fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" mov x5, x22 \n\t" -" \n\t" -" st1 {v0.s}[0],[x5],x14 \n\t" // Store c60 into quad and increment by rs_c. -" st1 {v0.s}[1],[x5],x14 \n\t" // Store c61 into quad and increment by rs_c. -" st1 {v0.s}[2],[x5],x14 \n\t" // Store c62 into quad and increment by rs_c. -" st1 {v0.s}[3],[x5],x14 \n\t" // Store c63 into quad and increment by rs_c. -" st1 {v1.s}[0],[x5],x14 \n\t" // Store c64 into quad and increment by rs_c. -" st1 {v1.s}[1],[x5],x14 \n\t" // Store c65 into quad and increment by rs_c. -" st1 {v1.s}[2],[x5],x14 \n\t" // Store c66 into quad and increment by rs_c. -" st1 {v1.s}[3],[x5],x14 \n\t" // Store c67 into quad and increment by rs_c. -" \n\t" -" mov x5, x23 \n\t" -" \n\t" -" st1 {v2.s}[0],[x5],x14 \n\t" // Store c70 into quad and increment by rs_c. -" st1 {v2.s}[1],[x5],x14 \n\t" // Store c71 into quad and increment by rs_c. -" st1 {v2.s}[2],[x5],x14 \n\t" // Store c72 into quad and increment by rs_c. -" st1 {v2.s}[3],[x5],x14 \n\t" // Store c73 into quad and increment by rs_c. -" st1 {v3.s}[0],[x5],x14 \n\t" // Store c74 into quad and increment by rs_c. -" st1 {v3.s}[1],[x5],x14 \n\t" // Store c75 into quad and increment by rs_c. -" st1 {v3.s}[2],[x5],x14 \n\t" // Store c76 into quad and increment by rs_c. 
-" st1 {v3.s}[3],[x5],x14 \n\t" // Store c77 into quad and increment by rs_c. -" \n\t" -" mov x5, x24 \n\t" -" \n\t" -" st1 {v4.s}[0],[x5],x14 \n\t" // Store c80 into quad and increment by rs_c. -" st1 {v4.s}[1],[x5],x14 \n\t" // Store c81 into quad and increment by rs_c. -" st1 {v4.s}[2],[x5],x14 \n\t" // Store c82 into quad and increment by rs_c. -" st1 {v4.s}[3],[x5],x14 \n\t" // Store c83 into quad and increment by rs_c. -" st1 {v5.s}[0],[x5],x14 \n\t" // Store c84 into quad and increment by rs_c. -" st1 {v5.s}[1],[x5],x14 \n\t" // Store c85 into quad and increment by rs_c. -" st1 {v5.s}[2],[x5],x14 \n\t" // Store c86 into quad and increment by rs_c. -" st1 {v5.s}[3],[x5],x14 \n\t" // Store c87 into quad and increment by rs_c. -" \n\t" -" dup v8.4s, wzr \n\t" -" dup v9.4s, wzr \n\t" -" dup v10.4s, wzr \n\t" -" dup v11.4s, wzr \n\t" -" dup v12.4s, wzr \n\t" -" dup v13.4s, wzr \n\t" -" \n\t" -" fcmp s7,#0.0 \n\t" -BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. -" \n\t" -" mov x5, x25 \n\t" -" \n\t" -" ld1 {v8.s}[0],[x5],x14 \n\t" // Load c90 into quad and increment by rs_c. -" ld1 {v8.s}[1],[x5],x14 \n\t" // Load c91 into quad and increment by rs_c. -" ld1 {v8.s}[2],[x5],x14 \n\t" // Load c92 into quad and increment by rs_c. -" ld1 {v8.s}[3],[x5],x14 \n\t" // Load c93 into quad and increment by rs_c. -" ld1 {v9.s}[0],[x5],x14 \n\t" // Load c94 into quad and increment by rs_c. -" ld1 {v9.s}[1],[x5],x14 \n\t" // Load c95 into quad and increment by rs_c. -" ld1 {v9.s}[2],[x5],x14 \n\t" // Load c96 into quad and increment by rs_c. -" ld1 {v9.s}[3],[x5],x14 \n\t" // Load c97 into quad and increment by rs_c. -" \n\t" -" mov x5, x26 \n\t" -" \n\t" -" ld1 {v10.s}[0],[x5],x14 \n\t" // Load c100 into quad and increment by rs_c. -" ld1 {v10.s}[1],[x5],x14 \n\t" // Load c101 into quad and increment by rs_c. -" ld1 {v10.s}[2],[x5],x14 \n\t" // Load c102 into quad and increment by rs_c. 
-" ld1 {v10.s}[3],[x5],x14 \n\t" // Load c103 into quad and increment by rs_c. -" ld1 {v11.s}[0],[x5],x14 \n\t" // Load c104 into quad and increment by rs_c. -" ld1 {v11.s}[1],[x5],x14 \n\t" // Load c105 into quad and increment by rs_c. -" ld1 {v11.s}[2],[x5],x14 \n\t" // Load c106 into quad and increment by rs_c. -" ld1 {v11.s}[3],[x5],x14 \n\t" // Load c107 into quad and increment by rs_c. -" \n\t" -" mov x5, x27 \n\t" -" \n\t" -" ld1 {v12.s}[0],[x5],x14 \n\t" // Load c110 into quad and increment by rs_c. -" ld1 {v12.s}[1],[x5],x14 \n\t" // Load c111 into quad and increment by rs_c. -" ld1 {v12.s}[2],[x5],x14 \n\t" // Load c112 into quad and increment by rs_c. -" ld1 {v12.s}[3],[x5],x14 \n\t" // Load c113 into quad and increment by rs_c. -" ld1 {v13.s}[0],[x5],x14 \n\t" // Load c114 into quad and increment by rs_c. -" ld1 {v13.s}[1],[x5],x14 \n\t" // Load c115 into quad and increment by rs_c. -" ld1 {v13.s}[2],[x5],x14 \n\t" // Load c116 into quad and increment by rs_c. -" ld1 {v13.s}[3],[x5],x14 \n\t" // Load c117 into quad and increment by rs_c. -" \n\t" -" fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta -" fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta -" fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta -" fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta -" fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta -" fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta -" \n\t" -LABEL(SBETAZEROGENSTOREDS4) -" \n\t" -" prfm pldl2keep,[x0] \n\t" -" prfm pldl2keep,[x1] \n\t" -" \n\t" -" fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha -" fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha -" \n\t" -" mov x5, x25 \n\t" -" \n\t" -" st1 {v8.s}[0],[x5],x14 \n\t" // Store c90 into quad and increment by rs_c. 
-" st1 {v8.s}[1],[x5],x14 \n\t" // Store c91 into quad and increment by rs_c. -" st1 {v8.s}[2],[x5],x14 \n\t" // Store c92 into quad and increment by rs_c. -" st1 {v8.s}[3],[x5],x14 \n\t" // Store c93 into quad and increment by rs_c. -" st1 {v9.s}[0],[x5],x14 \n\t" // Store c94 into quad and increment by rs_c. -" st1 {v9.s}[1],[x5],x14 \n\t" // Store c95 into quad and increment by rs_c. -" st1 {v9.s}[2],[x5],x14 \n\t" // Store c96 into quad and increment by rs_c. -" st1 {v9.s}[3],[x5],x14 \n\t" // Store c97 into quad and increment by rs_c. -" \n\t" -" mov x5, x26 \n\t" -" \n\t" -" st1 {v10.s}[0],[x5],x14 \n\t" // Store c100 into quad and increment by rs_c. -" st1 {v10.s}[1],[x5],x14 \n\t" // Store c101 into quad and increment by rs_c. -" st1 {v10.s}[2],[x5],x14 \n\t" // Store c102 into quad and increment by rs_c. -" st1 {v10.s}[3],[x5],x14 \n\t" // Store c103 into quad and increment by rs_c. -" st1 {v11.s}[0],[x5],x14 \n\t" // Store c104 into quad and increment by rs_c. -" st1 {v11.s}[1],[x5],x14 \n\t" // Store c105 into quad and increment by rs_c. -" st1 {v11.s}[2],[x5],x14 \n\t" // Store c106 into quad and increment by rs_c. -" st1 {v11.s}[3],[x5],x14 \n\t" // Store c107 into quad and increment by rs_c. -" \n\t" -" mov x5, x27 \n\t" -" \n\t" -" st1 {v12.s}[0],[x5],x14 \n\t" // Store c110 into quad and increment by rs_c. -" st1 {v12.s}[1],[x5],x14 \n\t" // Store c111 into quad and increment by rs_c. -" st1 {v12.s}[2],[x5],x14 \n\t" // Store c112 into quad and increment by rs_c. -" st1 {v12.s}[3],[x5],x14 \n\t" // Store c113 into quad and increment by rs_c. -" st1 {v13.s}[0],[x5],x14 \n\t" // Store c114 into quad and increment by rs_c. -" st1 {v13.s}[1],[x5],x14 \n\t" // Store c115 into quad and increment by rs_c. -" st1 {v13.s}[2],[x5],x14 \n\t" // Store c116 into quad and increment by rs_c. -" st1 {v13.s}[3],[x5],x14 \n\t" // Store c147 into quad and increment by rs_c. -" \n\t" -LABEL(SEND) // Done! 
-" \n\t" -:// output operands (none) -:// input operands - [aaddr] "m" (a), // 0 - [baddr] "m" (b), // 1 - [caddr] "m" (c), // 2 - [k_iter] "m" (k_iter), // 3 - [k_left] "m" (k_left), // 4 - [alpha] "m" (alpha), // 5 - [beta] "m" (beta), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [a_next] "m" (a_next), // 9 - [b_next] "m" (b_next) // 10 -:// Register clobber list - "x0", "x1", "x2", - "x5", "x6", "x10","x14", - "x16","x17","x19","x20", - "x21","x22","x23","x24", - "x25","x26","x27", - "v0", "v1", "v2", "v3", - "v4", "v5", "v6", "v7", - "v8", "v9", "v10","v11", - "v12","v13","v14","v15", - "v16","v17","v18","v19", - "v20","v21","v22","v23", - "v24","v25","v26","v27", - "v28","v29","v30","v31" -); + __asm__ volatile + ( + " \n\t" + " \n\t" + " ldr x0,%[aaddr] \n\t" // Load address of A. + " ldr x1,%[baddr] \n\t" // Load address of B. + " ldr x2,%[caddr] \n\t" // Load address of C. + " \n\t" + " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). + " ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). + " \n\t" + " ldr x10,%[cs_c] \n\t" // Load cs_c. + " lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. + " \n\t" + " ldr x14,%[rs_c] \n\t" // Load rs_c. + " lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). + " \n\t" + " add x16,x2,x10 \n\t" //Load address Column 1 of C + " add x17,x16,x10 \n\t" //Load address Column 2 of C + " add x19,x17,x10 \n\t" //Load address Column 3 of C + " add x20,x19,x10 \n\t" //Load address Column 4 of C + " add x21,x20,x10 \n\t" //Load address Column 5 of C + " add x22,x21,x10 \n\t" //Load address Column 6 of C + " add x23,x22,x10 \n\t" //Load address Column 7 of C + " add x24,x23,x10 \n\t" //Load address Column 8 of C + " add x25,x24,x10 \n\t" //Load address Column 9 of C + " add x26,x25,x10 \n\t" //Load address Column 10 of C + " add x27,x26,x10 \n\t" //Load address Column 11 of C + " \n\t" + " prfm pldl1keep,[x2] \n\t" // Prefetch c. + " prfm pldl1keep,[x16] \n\t" // Prefetch c. 
+ " prfm pldl1keep,[x17] \n\t" // Prefetch c. + " prfm pldl1keep,[x19] \n\t" // Prefetch c. + " prfm pldl1keep,[x20] \n\t" // Prefetch c. + " prfm pldl1keep,[x21] \n\t" // Prefetch c. + " prfm pldl1keep,[x22] \n\t" // Prefetch c. + " prfm pldl1keep,[x23] \n\t" // Prefetch c. + " prfm pldl1keep,[x24] \n\t" // Prefetch c. + " prfm pldl1keep,[x25] \n\t" // Prefetch c. + " prfm pldl1keep,[x26] \n\t" // Prefetch c. + " prfm pldl1keep,[x27] \n\t" // Prefetch c. + " \n\t" + " dup v8.4s, wzr \n\t" // Vector for accummulating column 0 + " prfm PLDL1KEEP, [x1, #192] \n\t" + " dup v9.4s, wzr \n\t" // Vector for accummulating column 0 + " prfm PLDL1KEEP, [x1, #256] \n\t" + " dup v10.4s, wzr \n\t" // Vector for accummulating column 1 + " prfm PLDL1KEEP, [x1, #320] \n\t" + " dup v11.4s, wzr \n\t" // Vector for accummulating column 1 + " dup v12.4s, wzr \n\t" // Vector for accummulating column 2 + " dup v13.4s, wzr \n\t" // Vector for accummulating column 2 + " \n\t" + " dup v14.4s, wzr \n\t" // Vector for accummulating column 3 + " prfm PLDL1KEEP, [x0, #128] \n\t" + " dup v15.4s, wzr \n\t" // Vector for accummulating column 3 + " prfm PLDL1KEEP, [x0, #192] \n\t" + " dup v16.4s, wzr \n\t" // Vector for accummulating column 4 + " dup v17.4s, wzr \n\t" // Vector for accummulating column 4 + " dup v18.4s, wzr \n\t" // Vector for accummulating column 5 + " dup v19.4s, wzr \n\t" // Vector for accummulating column 5 + " \n\t" + " dup v20.4s, wzr \n\t" // Vector for accummulating column 6 + " dup v21.4s, wzr \n\t" // Vector for accummulating column 6 + " dup v22.4s, wzr \n\t" // Vector for accummulating column 7 + " dup v23.4s, wzr \n\t" // Vector for accummulating column 7 + " dup v24.4s, wzr \n\t" // Vector for accummulating column 8 + " dup v25.4s, wzr \n\t" // Vector for accummulating column 8 + " \n\t" + " dup v26.4s, wzr \n\t" // Vector for accummulating column 9 + " dup v27.4s, wzr \n\t" // Vector for accummulating column 9 + " dup v28.4s, wzr \n\t" // Vector for accummulating 
column 10 + " dup v29.4s, wzr \n\t" // Vector for accummulating column 10 + " dup v30.4s, wzr \n\t" // Vector for accummulating column 11 + " dup v31.4s, wzr \n\t" // Vector for accummulating column 11 + " \n\t" + " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. + BEQ(SCONSIDERKLEFT) + " \n\t" + " ldr q0, [x0] \n\t" + " ldr q1, [x0, #16] \n\t" // Load a + " \n\t" + " ldr q2, [x1] \n\t" // Load b + " ldr q3, [x1, #16] \n\t" + " ldr q4, [x1, #32] \n\t" + " \n\t" + " add x0, x0, #32 \n\t" //update address of A + " add x1, x1, #48 \n\t" //update address of B + " \n\t" + " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. + BEQ(SLASTITER) // (as loop is do-while-like). + " \n\t" + LABEL(SLOOPKITER) // Body of the k_iter loop. + " \n\t" + " ldr q5, [x0] \n\t" + " fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s, v1.4s,v2.s[0] \n\t" // Accummulate. + " ldr q6, [x0, #16] \n\t" + " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. + " ldr q2, [x1] \n\t" + " \n\t" + " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. + " prfm PLDL1KEEP, [x1, #336] \n\t" + " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. + " prfm PLDL1KEEP, [x1, #400] \n\t" + " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. + " prfm PLDL1KEEP, [x1, #464] \n\t" + " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. 
+ " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. + " ldr q3, [x1, #16] \n\t" + " \n\t" + " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. + " ldr q4, [x1, #32] \n\t" + " \n\t" //End It 1 + " \n\t" + " ldr q0, [x0, #32] \n\t" + " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. + " ldr q1, [x0, #48] \n\t" + " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. + " ldr q2, [x1, #48] \n\t" + " \n\t" + " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. + " prfm PLDL1KEEP, [x0, #224] \n\t" + " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. + " prfm PLDL1KEEP, [x0, #288] \n\t" + " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. + " ldr q3, [x1, #64] \n\t" + " \n\t" + " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. + " ldr q4, [x1, #80] \n\t" + " \n\t" //End It 2 + " \n\t" + " ldr q5, [x0, #64] \n\t" + " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. 
+ " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. + " ldr q6, [x0, #80] \n\t" + " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. + " ldr q2, [x1, #96] \n\t" + " \n\t" + " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. + " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. + " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. + " ldr q3, [x1, #112] \n\t" + " \n\t" + " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. + " ldr q4, [x1, #128] \n\t" + " \n\t" //End It 3 + " \n\t" + " ldr q0, [x0, #96] \n\t" + " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. + " ldr q1, [x0, #112] \n\t" + " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. + " ldr q2, [x1, #144] \n\t" + " \n\t" + " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. + " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. 
+ " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. + " ldr q3, [x1, #160] \n\t" + " \n\t" + " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. + " ldr q4, [x1, #176] \n\t" + " add x1, x1, #192 \n\t" + " add x0, x0, #128 \n\t" + " \n\t" //End It 4 + " sub x5,x5,1 \n\t" // i-=1. + " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. + BNE(SLOOPKITER) + " \n\t" + LABEL(SLASTITER) // Last iteration of k_iter loop. + " \n\t" + " \n\t" + " ldr q5, [x0] \n\t" + " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. + " ldr q6, [x0, #16] \n\t" + " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. + " ldr q2, [x1] \n\t" + " \n\t" + " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. + " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. + " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. 
+ " \n\t" + " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. + " ldr q3, [x1, #16] \n\t" + " \n\t" + " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. + " ldr q4, [x1, #32] \n\t" + " \n\t" //End It 1 + " \n\t" + " ldr q0, [x0, #32] \n\t" + " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. + " ldr q1, [x0, #48] \n\t" + " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. + " ldr q2, [x1, #48] \n\t" + " \n\t" + " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. + " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. + " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. + " ldr q3, [x1, #64] \n\t" + " \n\t" + " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. 
+ " ldr q4, [x1, #80] \n\t" + " \n\t" //End It 2 + " \n\t" + " ldr q5, [x0, #64] \n\t" + " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. + " ldr q6, [x0, #80] \n\t" + " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. + " ldr q2, [x1, #96] \n\t" + " \n\t" + " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. + " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. + " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. + " ldr q3, [x1, #112] \n\t" + " \n\t" + " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. + " ldr q4, [x1, #128] \n\t" + " \n\t" //End It 3 + " \n\t" + " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. + " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. 
+ " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. + " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. + " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. + " add x1, x1, #144 \n\t" + " add x0, x0, #96 \n\t" + " \n\t" //End It 4 + " \n\t" + LABEL(SCONSIDERKLEFT) + " cmp x6,0 \n\t" // If k_left == 0, we are done. + BEQ(SPOSTACCUM) // else, we enter the k_left loop. + " \n\t" + LABEL(SLOOPKLEFT) // Body of the left iterations + " \n\t" + " ldr q0, [x0],#16 \n\t" + " ldr q1, [x0],#16 \n\t" // Load a + " \n\t" + " ldr q2, [x1],#16 \n\t" // Load b + " ldr q3, [x1],#16 \n\t" + " ldr q4, [x1],#16 \n\t" + " \n\t" + " sub x6,x6,1 \n\t" // i = i-1. + " \n\t" + " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. + " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. + " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. + " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. + " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. + " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. + " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. + " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. + " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. + " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. + " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. + " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. 
+ " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. + " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. + " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. + " \n\t" + " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. + " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. + " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. + " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. + " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. + " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. + " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. + " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. + " \n\t" + " cmp x6,0 \n\t" // Iterate again. + BNE(SLOOPKLEFT) // if i!=0. + " \n\t" + LABEL(SPOSTACCUM) + " \n\t" + " ldr x0,%[alpha] \n\t" // Alpha address. + " ldr x1,%[beta] \n\t" // Beta address. + " \n\t" + " ld1r {v6.4s},[x0] \n\t" // Load alpha. + " ld1r {v7.4s},[x1] \n\t" // Load beta + " \n\t" + " ldr x0,%[a_next] \n\t" // Pointer to next block of A. + " ldr x1,%[b_next] \n\t" // Pointer to next pointer of B. + " \n\t" + " cmp x14,#4 \n\t" // If rs_c != 1 (column-major) + BNE(SGENSTORED) + " \n\t" + LABEL(SCOLSTORED) // C is column-major. + " \n\t" + " dup v0.4s, wzr \n\t" + " dup v1.4s, wzr \n\t" + " dup v2.4s, wzr \n\t" + " dup v3.4s, wzr \n\t" + " dup v4.4s, wzr \n\t" + " dup v5.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. 
+ " \n\t" + " ldr q0, [x2] \n\t" //Load column 0 of C + " ldr q1, [x2, #16] \n\t" + " ldr q2, [x16] \n\t" //Load column 1 of C + " ldr q3, [x16, #16] \n\t" + " ldr q4, [x17] \n\t" //Load column 2 of C + " ldr q5, [x17, #16] \n\t" + " \n\t" + " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta + " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta + " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta + " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta + " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta + " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROCOLSTOREDS1) + " \n\t" + " fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " str q0, [x2] \n\t" //Store column 0 of C + " str q1, [x2, #16] \n\t" + " str q2, [x16] \n\t" //Store column 1 of C + " str q3, [x16, #16] \n\t" + " str q4, [x17] \n\t" //Store column 2 of C + " str q5, [x17, #16] \n\t" + " \n\t" + " dup v8.4s, wzr \n\t" + " dup v9.4s, wzr \n\t" + " dup v10.4s, wzr \n\t" + " dup v11.4s, wzr \n\t" + " dup v12.4s, wzr \n\t" + " dup v13.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
+ " \n\t" + " ldr q8, [x19] \n\t" //Load column 3 of C + " ldr q9, [x19, #16] \n\t" + " ldr q10, [x20] \n\t" //Load column 4 of C + " ldr q11, [x20, #16] \n\t" + " ldr q12, [x21] \n\t" //Load column 5 of C + " ldr q13, [x21, #16] \n\t" + " \n\t" + " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta + " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta + " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta + " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta + " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta + " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROCOLSTOREDS2) + " \n\t" + " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " str q8, [x19] \n\t" //Store column 3 of C + " str q9, [x19, #16] \n\t" + " str q10, [x20] \n\t" //Store column 4 of C + " str q11, [x20, #16] \n\t" + " str q12, [x21] \n\t" //Store column 5 of C + " str q13, [x21, #16] \n\t" + " \n\t" + " dup v0.4s, wzr \n\t" + " dup v1.4s, wzr \n\t" + " dup v2.4s, wzr \n\t" + " dup v3.4s, wzr \n\t" + " dup v4.4s, wzr \n\t" + " dup v5.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
+ " \n\t" + " ldr q0, [x22] \n\t" //Load column 6 of C + " ldr q1, [x22, #16] \n\t" + " ldr q2, [x23] \n\t" //Load column 7 of C + " ldr q3, [x23, #16] \n\t" + " ldr q4, [x24] \n\t" //Load column 8 of C + " ldr q5, [x24, #16] \n\t" + " \n\t" + " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta + " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta + " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta + " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta + " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta + " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROCOLSTOREDS3) + " \n\t" + " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " str q0, [x22] \n\t" //Store column 6 of C + " str q1, [x22, #16] \n\t" + " str q2, [x23] \n\t" //Store column 7 of C + " str q3, [x23, #16] \n\t" + " str q4, [x24] \n\t" //Store column 8 of C + " str q5, [x24, #16] \n\t" + " \n\t" + " dup v8.4s, wzr \n\t" + " dup v9.4s, wzr \n\t" + " dup v10.4s, wzr \n\t" + " dup v11.4s, wzr \n\t" + " dup v12.4s, wzr \n\t" + " dup v13.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
+ " \n\t" + " ldr q8, [x25] \n\t" //Load column 9 of C + " ldr q9, [x25, #16] \n\t" + " ldr q10, [x26] \n\t" //Load column 10 of C + " ldr q11, [x26, #16] \n\t" + " ldr q12, [x27] \n\t" //Load column 11 of C + " ldr q13, [x27, #16] \n\t" + " \n\t" + " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta + " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta + " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta + " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta + " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta + " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROCOLSTOREDS4) + " \n\t" + " prfm pldl2keep,[x0] \n\t" + " prfm pldl2keep,[x1] \n\t" + " \n\t" + " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " str q8, [x25] \n\t" //Store column 9 of C + " str q9, [x25, #16] \n\t" + " str q10, [x26] \n\t" //Store column 10 of C + " str q11, [x26, #16] \n\t" + " str q12, [x27] \n\t" //Store column 11 of C + " str q13, [x27, #16] \n\t" + " \n\t" + " \n\t" + BRANCH(SEND) // Done. + " \n\t" + " \n\t" + LABEL(SGENSTORED) // C is general-stride stored. + " \n\t" + " \n\t" + " dup v0.4s, wzr \n\t" + " dup v1.4s, wzr \n\t" + " dup v2.4s, wzr \n\t" + " dup v3.4s, wzr \n\t" + " dup v4.4s, wzr \n\t" + " dup v5.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. + " \n\t" + " mov x5, x2 \n\t" + " \n\t" + " ld1 {v0.s}[0],[x5],x14 \n\t" // Load c00 into quad and increment by rs_c. + " ld1 {v0.s}[1],[x5],x14 \n\t" // Load c01 into quad and increment by rs_c. + " ld1 {v0.s}[2],[x5],x14 \n\t" // Load c02 into quad and increment by rs_c. + " ld1 {v0.s}[3],[x5],x14 \n\t" // Load c03 into quad and increment by rs_c. 
+ " ld1 {v1.s}[0],[x5],x14 \n\t" // Load c04 into quad and increment by rs_c. + " ld1 {v1.s}[1],[x5],x14 \n\t" // Load c05 into quad and increment by rs_c. + " ld1 {v1.s}[2],[x5],x14 \n\t" // Load c06 into quad and increment by rs_c. + " ld1 {v1.s}[3],[x5],x14 \n\t" // Load c07 into quad and increment by rs_c. + " \n\t" + " mov x5, x16 \n\t" + " \n\t" + " ld1 {v2.s}[0],[x5],x14 \n\t" // Load c10 into quad and increment by rs_c. + " ld1 {v2.s}[1],[x5],x14 \n\t" // Load c11 into quad and increment by rs_c. + " ld1 {v2.s}[2],[x5],x14 \n\t" // Load c12 into quad and increment by rs_c. + " ld1 {v2.s}[3],[x5],x14 \n\t" // Load c13 into quad and increment by rs_c. + " ld1 {v3.s}[0],[x5],x14 \n\t" // Load c14 into quad and increment by rs_c. + " ld1 {v3.s}[1],[x5],x14 \n\t" // Load c15 into quad and increment by rs_c. + " ld1 {v3.s}[2],[x5],x14 \n\t" // Load c16 into quad and increment by rs_c. + " ld1 {v3.s}[3],[x5],x14 \n\t" // Load c17 into quad and increment by rs_c. + " \n\t" + " mov x5, x17 \n\t" + " \n\t" + " ld1 {v4.s}[0],[x5],x14 \n\t" // Load c20 into quad and increment by rs_c. + " ld1 {v4.s}[1],[x5],x14 \n\t" // Load c21 into quad and increment by rs_c. + " ld1 {v4.s}[2],[x5],x14 \n\t" // Load c22 into quad and increment by rs_c. + " ld1 {v4.s}[3],[x5],x14 \n\t" // Load c23 into quad and increment by rs_c. + " ld1 {v5.s}[0],[x5],x14 \n\t" // Load c24 into quad and increment by rs_c. + " ld1 {v5.s}[1],[x5],x14 \n\t" // Load c25 into quad and increment by rs_c. + " ld1 {v5.s}[2],[x5],x14 \n\t" // Load c26 into quad and increment by rs_c. + " ld1 {v5.s}[3],[x5],x14 \n\t" // Load c27 into quad and increment by rs_c. 
+ " \n\t" + " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta + " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta + " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta + " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta + " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta + " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROGENSTOREDS1) + " \n\t" + " fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " mov x5, x2 \n\t" + " \n\t" + " st1 {v0.s}[0],[x5],x14 \n\t" // Store c00 into quad and increment by rs_c. + " st1 {v0.s}[1],[x5],x14 \n\t" // Store c01 into quad and increment by rs_c. + " st1 {v0.s}[2],[x5],x14 \n\t" // Store c02 into quad and increment by rs_c. + " st1 {v0.s}[3],[x5],x14 \n\t" // Store c03 into quad and increment by rs_c. + " st1 {v1.s}[0],[x5],x14 \n\t" // Store c04 into quad and increment by rs_c. + " st1 {v1.s}[1],[x5],x14 \n\t" // Store c05 into quad and increment by rs_c. + " st1 {v1.s}[2],[x5],x14 \n\t" // Store c06 into quad and increment by rs_c. + " st1 {v1.s}[3],[x5],x14 \n\t" // Store c07 into quad and increment by rs_c. + " \n\t" + " mov x5, x16 \n\t" + " \n\t" + " st1 {v2.s}[0],[x5],x14 \n\t" // Store c10 into quad and increment by rs_c. + " st1 {v2.s}[1],[x5],x14 \n\t" // Store c11 into quad and increment by rs_c. + " st1 {v2.s}[2],[x5],x14 \n\t" // Store c12 into quad and increment by rs_c. + " st1 {v2.s}[3],[x5],x14 \n\t" // Store c13 into quad and increment by rs_c. + " st1 {v3.s}[0],[x5],x14 \n\t" // Store c14 into quad and increment by rs_c. + " st1 {v3.s}[1],[x5],x14 \n\t" // Store c15 into quad and increment by rs_c. + " st1 {v3.s}[2],[x5],x14 \n\t" // Store c16 into quad and increment by rs_c. 
+ " st1 {v3.s}[3],[x5],x14 \n\t" // Store c17 into quad and increment by rs_c. + " \n\t" + " mov x5, x17 \n\t" + " \n\t" + " st1 {v4.s}[0],[x5],x14 \n\t" // Store c20 into quad and increment by rs_c. + " st1 {v4.s}[1],[x5],x14 \n\t" // Store c21 into quad and increment by rs_c. + " st1 {v4.s}[2],[x5],x14 \n\t" // Store c22 into quad and increment by rs_c. + " st1 {v4.s}[3],[x5],x14 \n\t" // Store c23 into quad and increment by rs_c. + " st1 {v5.s}[0],[x5],x14 \n\t" // Store c24 into quad and increment by rs_c. + " st1 {v5.s}[1],[x5],x14 \n\t" // Store c25 into quad and increment by rs_c. + " st1 {v5.s}[2],[x5],x14 \n\t" // Store c26 into quad and increment by rs_c. + " st1 {v5.s}[3],[x5],x14 \n\t" // Store c27 into quad and increment by rs_c. + " \n\t" + " dup v8.4s, wzr \n\t" + " dup v9.4s, wzr \n\t" + " dup v10.4s, wzr \n\t" + " dup v11.4s, wzr \n\t" + " dup v12.4s, wzr \n\t" + " dup v13.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. + " \n\t" + " mov x5, x19 \n\t" + " \n\t" + " ld1 {v8.s}[0],[x5],x14 \n\t" // Load c30 into quad and increment by rs_c. + " ld1 {v8.s}[1],[x5],x14 \n\t" // Load c31 into quad and increment by rs_c. + " ld1 {v8.s}[2],[x5],x14 \n\t" // Load c32 into quad and increment by rs_c. + " ld1 {v8.s}[3],[x5],x14 \n\t" // Load c33 into quad and increment by rs_c. + " ld1 {v9.s}[0],[x5],x14 \n\t" // Load c34 into quad and increment by rs_c. + " ld1 {v9.s}[1],[x5],x14 \n\t" // Load c35 into quad and increment by rs_c. + " ld1 {v9.s}[2],[x5],x14 \n\t" // Load c36 into quad and increment by rs_c. + " ld1 {v9.s}[3],[x5],x14 \n\t" // Load c37 into quad and increment by rs_c. + " \n\t" + " mov x5, x20 \n\t" + " \n\t" + " ld1 {v10.s}[0],[x5],x14 \n\t" // Load c40 into quad and increment by rs_c. + " ld1 {v10.s}[1],[x5],x14 \n\t" // Load c41 into quad and increment by rs_c. + " ld1 {v10.s}[2],[x5],x14 \n\t" // Load c42 into quad and increment by rs_c. 
+ " ld1 {v10.s}[3],[x5],x14 \n\t" // Load c43 into quad and increment by rs_c. + " ld1 {v11.s}[0],[x5],x14 \n\t" // Load c44 into quad and increment by rs_c. + " ld1 {v11.s}[1],[x5],x14 \n\t" // Load c45 into quad and increment by rs_c. + " ld1 {v11.s}[2],[x5],x14 \n\t" // Load c46 into quad and increment by rs_c. + " ld1 {v11.s}[3],[x5],x14 \n\t" // Load c47 into quad and increment by rs_c. + " \n\t" + " mov x5, x21 \n\t" + " \n\t" + " ld1 {v12.s}[0],[x5],x14 \n\t" // Load c50 into quad and increment by rs_c. + " ld1 {v12.s}[1],[x5],x14 \n\t" // Load c51 into quad and increment by rs_c. + " ld1 {v12.s}[2],[x5],x14 \n\t" // Load c52 into quad and increment by rs_c. + " ld1 {v12.s}[3],[x5],x14 \n\t" // Load c53 into quad and increment by rs_c. + " ld1 {v13.s}[0],[x5],x14 \n\t" // Load c54 into quad and increment by rs_c. + " ld1 {v13.s}[1],[x5],x14 \n\t" // Load c55 into quad and increment by rs_c. + " ld1 {v13.s}[2],[x5],x14 \n\t" // Load c56 into quad and increment by rs_c. + " ld1 {v13.s}[3],[x5],x14 \n\t" // Load c57 into quad and increment by rs_c. + " \n\t" + " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta + " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta + " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta + " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta + " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta + " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROGENSTOREDS2) + " \n\t" + " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " mov x5, x19 \n\t" + " \n\t" + " st1 {v8.s}[0],[x5],x14 \n\t" // Store c30 into quad and increment by rs_c. + " st1 {v8.s}[1],[x5],x14 \n\t" // Store c31 into quad and increment by rs_c. 
+ " st1 {v8.s}[2],[x5],x14 \n\t" // Store c32 into quad and increment by rs_c. + " st1 {v8.s}[3],[x5],x14 \n\t" // Store c33 into quad and increment by rs_c. + " st1 {v9.s}[0],[x5],x14 \n\t" // Store c34 into quad and increment by rs_c. + " st1 {v9.s}[1],[x5],x14 \n\t" // Store c35 into quad and increment by rs_c. + " st1 {v9.s}[2],[x5],x14 \n\t" // Store c36 into quad and increment by rs_c. + " st1 {v9.s}[3],[x5],x14 \n\t" // Store c37 into quad and increment by rs_c. + " \n\t" + " mov x5, x20 \n\t" + " \n\t" + " st1 {v10.s}[0],[x5],x14 \n\t" // Store c40 into quad and increment by rs_c. + " st1 {v10.s}[1],[x5],x14 \n\t" // Store c41 into quad and increment by rs_c. + " st1 {v10.s}[2],[x5],x14 \n\t" // Store c42 into quad and increment by rs_c. + " st1 {v10.s}[3],[x5],x14 \n\t" // Store c43 into quad and increment by rs_c. + " st1 {v11.s}[0],[x5],x14 \n\t" // Store c44 into quad and increment by rs_c. + " st1 {v11.s}[1],[x5],x14 \n\t" // Store c45 into quad and increment by rs_c. + " st1 {v11.s}[2],[x5],x14 \n\t" // Store c46 into quad and increment by rs_c. + " st1 {v11.s}[3],[x5],x14 \n\t" // Store c47 into quad and increment by rs_c. + " \n\t" + " mov x5, x21 \n\t" + " \n\t" + " st1 {v12.s}[0],[x5],x14 \n\t" // Store c50 into quad and increment by rs_c. + " st1 {v12.s}[1],[x5],x14 \n\t" // Store c51 into quad and increment by rs_c. + " st1 {v12.s}[2],[x5],x14 \n\t" // Store c52 into quad and increment by rs_c. + " st1 {v12.s}[3],[x5],x14 \n\t" // Store c53 into quad and increment by rs_c. + " st1 {v13.s}[0],[x5],x14 \n\t" // Store c54 into quad and increment by rs_c. + " st1 {v13.s}[1],[x5],x14 \n\t" // Store c55 into quad and increment by rs_c. + " st1 {v13.s}[2],[x5],x14 \n\t" // Store c56 into quad and increment by rs_c. + " st1 {v13.s}[3],[x5],x14 \n\t" // Store c57 into quad and increment by rs_c. 
+ " \n\t" + " dup v0.4s, wzr \n\t" + " dup v1.4s, wzr \n\t" + " dup v2.4s, wzr \n\t" + " dup v3.4s, wzr \n\t" + " dup v4.4s, wzr \n\t" + " dup v5.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. + " \n\t" + " mov x5, x22 \n\t" + " \n\t" + " ld1 {v0.s}[0],[x5],x14 \n\t" // Load c60 into quad and increment by rs_c. + " ld1 {v0.s}[1],[x5],x14 \n\t" // Load c61 into quad and increment by rs_c. + " ld1 {v0.s}[2],[x5],x14 \n\t" // Load c62 into quad and increment by rs_c. + " ld1 {v0.s}[3],[x5],x14 \n\t" // Load c63 into quad and increment by rs_c. + " ld1 {v1.s}[0],[x5],x14 \n\t" // Load c64 into quad and increment by rs_c. + " ld1 {v1.s}[1],[x5],x14 \n\t" // Load c65 into quad and increment by rs_c. + " ld1 {v1.s}[2],[x5],x14 \n\t" // Load c66 into quad and increment by rs_c. + " ld1 {v1.s}[3],[x5],x14 \n\t" // Load c67 into quad and increment by rs_c. + " \n\t" + " mov x5, x23 \n\t" + " \n\t" + " ld1 {v2.s}[0],[x5],x14 \n\t" // Load c70 into quad and increment by rs_c. + " ld1 {v2.s}[1],[x5],x14 \n\t" // Load c71 into quad and increment by rs_c. + " ld1 {v2.s}[2],[x5],x14 \n\t" // Load c72 into quad and increment by rs_c. + " ld1 {v2.s}[3],[x5],x14 \n\t" // Load c73 into quad and increment by rs_c. + " ld1 {v3.s}[0],[x5],x14 \n\t" // Load c74 into quad and increment by rs_c. + " ld1 {v3.s}[1],[x5],x14 \n\t" // Load c75 into quad and increment by rs_c. + " ld1 {v3.s}[2],[x5],x14 \n\t" // Load c76 into quad and increment by rs_c. + " ld1 {v3.s}[3],[x5],x14 \n\t" // Load c77 into quad and increment by rs_c. + " \n\t" + " mov x5, x24 \n\t" + " \n\t" + " ld1 {v4.s}[0],[x5],x14 \n\t" // Load c80 into quad and increment by rs_c. + " ld1 {v4.s}[1],[x5],x14 \n\t" // Load c81 into quad and increment by rs_c. + " ld1 {v4.s}[2],[x5],x14 \n\t" // Load c82 into quad and increment by rs_c. + " ld1 {v4.s}[3],[x5],x14 \n\t" // Load c83 into quad and increment by rs_c. 
+ " ld1 {v5.s}[0],[x5],x14 \n\t" // Load c84 into quad and increment by rs_c. + " ld1 {v5.s}[1],[x5],x14 \n\t" // Load c85 into quad and increment by rs_c. + " ld1 {v5.s}[2],[x5],x14 \n\t" // Load c86 into quad and increment by rs_c. + " ld1 {v5.s}[3],[x5],x14 \n\t" // Load c87 into quad and increment by rs_c. + " \n\t" + " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta + " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta + " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta + " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta + " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta + " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROGENSTOREDS3) + " \n\t" + " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " mov x5, x22 \n\t" + " \n\t" + " st1 {v0.s}[0],[x5],x14 \n\t" // Store c60 into quad and increment by rs_c. + " st1 {v0.s}[1],[x5],x14 \n\t" // Store c61 into quad and increment by rs_c. + " st1 {v0.s}[2],[x5],x14 \n\t" // Store c62 into quad and increment by rs_c. + " st1 {v0.s}[3],[x5],x14 \n\t" // Store c63 into quad and increment by rs_c. + " st1 {v1.s}[0],[x5],x14 \n\t" // Store c64 into quad and increment by rs_c. + " st1 {v1.s}[1],[x5],x14 \n\t" // Store c65 into quad and increment by rs_c. + " st1 {v1.s}[2],[x5],x14 \n\t" // Store c66 into quad and increment by rs_c. + " st1 {v1.s}[3],[x5],x14 \n\t" // Store c67 into quad and increment by rs_c. + " \n\t" + " mov x5, x23 \n\t" + " \n\t" + " st1 {v2.s}[0],[x5],x14 \n\t" // Store c70 into quad and increment by rs_c. + " st1 {v2.s}[1],[x5],x14 \n\t" // Store c71 into quad and increment by rs_c. + " st1 {v2.s}[2],[x5],x14 \n\t" // Store c72 into quad and increment by rs_c. 
+ " st1 {v2.s}[3],[x5],x14 \n\t" // Store c73 into quad and increment by rs_c. + " st1 {v3.s}[0],[x5],x14 \n\t" // Store c74 into quad and increment by rs_c. + " st1 {v3.s}[1],[x5],x14 \n\t" // Store c75 into quad and increment by rs_c. + " st1 {v3.s}[2],[x5],x14 \n\t" // Store c76 into quad and increment by rs_c. + " st1 {v3.s}[3],[x5],x14 \n\t" // Store c77 into quad and increment by rs_c. + " \n\t" + " mov x5, x24 \n\t" + " \n\t" + " st1 {v4.s}[0],[x5],x14 \n\t" // Store c80 into quad and increment by rs_c. + " st1 {v4.s}[1],[x5],x14 \n\t" // Store c81 into quad and increment by rs_c. + " st1 {v4.s}[2],[x5],x14 \n\t" // Store c82 into quad and increment by rs_c. + " st1 {v4.s}[3],[x5],x14 \n\t" // Store c83 into quad and increment by rs_c. + " st1 {v5.s}[0],[x5],x14 \n\t" // Store c84 into quad and increment by rs_c. + " st1 {v5.s}[1],[x5],x14 \n\t" // Store c85 into quad and increment by rs_c. + " st1 {v5.s}[2],[x5],x14 \n\t" // Store c86 into quad and increment by rs_c. + " st1 {v5.s}[3],[x5],x14 \n\t" // Store c87 into quad and increment by rs_c. + " \n\t" + " dup v8.4s, wzr \n\t" + " dup v9.4s, wzr \n\t" + " dup v10.4s, wzr \n\t" + " dup v11.4s, wzr \n\t" + " dup v12.4s, wzr \n\t" + " dup v13.4s, wzr \n\t" + " \n\t" + " fcmp s7,#0.0 \n\t" + BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. + " \n\t" + " mov x5, x25 \n\t" + " \n\t" + " ld1 {v8.s}[0],[x5],x14 \n\t" // Load c90 into quad and increment by rs_c. + " ld1 {v8.s}[1],[x5],x14 \n\t" // Load c91 into quad and increment by rs_c. + " ld1 {v8.s}[2],[x5],x14 \n\t" // Load c92 into quad and increment by rs_c. + " ld1 {v8.s}[3],[x5],x14 \n\t" // Load c93 into quad and increment by rs_c. + " ld1 {v9.s}[0],[x5],x14 \n\t" // Load c94 into quad and increment by rs_c. + " ld1 {v9.s}[1],[x5],x14 \n\t" // Load c95 into quad and increment by rs_c. + " ld1 {v9.s}[2],[x5],x14 \n\t" // Load c96 into quad and increment by rs_c. + " ld1 {v9.s}[3],[x5],x14 \n\t" // Load c97 into quad and increment by rs_c. 
+ " \n\t" + " mov x5, x26 \n\t" + " \n\t" + " ld1 {v10.s}[0],[x5],x14 \n\t" // Load c100 into quad and increment by rs_c. + " ld1 {v10.s}[1],[x5],x14 \n\t" // Load c101 into quad and increment by rs_c. + " ld1 {v10.s}[2],[x5],x14 \n\t" // Load c102 into quad and increment by rs_c. + " ld1 {v10.s}[3],[x5],x14 \n\t" // Load c103 into quad and increment by rs_c. + " ld1 {v11.s}[0],[x5],x14 \n\t" // Load c104 into quad and increment by rs_c. + " ld1 {v11.s}[1],[x5],x14 \n\t" // Load c105 into quad and increment by rs_c. + " ld1 {v11.s}[2],[x5],x14 \n\t" // Load c106 into quad and increment by rs_c. + " ld1 {v11.s}[3],[x5],x14 \n\t" // Load c107 into quad and increment by rs_c. + " \n\t" + " mov x5, x27 \n\t" + " \n\t" + " ld1 {v12.s}[0],[x5],x14 \n\t" // Load c110 into quad and increment by rs_c. + " ld1 {v12.s}[1],[x5],x14 \n\t" // Load c111 into quad and increment by rs_c. + " ld1 {v12.s}[2],[x5],x14 \n\t" // Load c112 into quad and increment by rs_c. + " ld1 {v12.s}[3],[x5],x14 \n\t" // Load c113 into quad and increment by rs_c. + " ld1 {v13.s}[0],[x5],x14 \n\t" // Load c114 into quad and increment by rs_c. + " ld1 {v13.s}[1],[x5],x14 \n\t" // Load c115 into quad and increment by rs_c. + " ld1 {v13.s}[2],[x5],x14 \n\t" // Load c116 into quad and increment by rs_c. + " ld1 {v13.s}[3],[x5],x14 \n\t" // Load c117 into quad and increment by rs_c. 
+ " \n\t" + " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta + " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta + " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta + " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta + " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta + " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta + " \n\t" + LABEL(SBETAZEROGENSTOREDS4) + " \n\t" + " prfm pldl2keep,[x0] \n\t" + " prfm pldl2keep,[x1] \n\t" + " \n\t" + " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha + " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha + " \n\t" + " mov x5, x25 \n\t" + " \n\t" + " st1 {v8.s}[0],[x5],x14 \n\t" // Store c90 into quad and increment by rs_c. + " st1 {v8.s}[1],[x5],x14 \n\t" // Store c91 into quad and increment by rs_c. + " st1 {v8.s}[2],[x5],x14 \n\t" // Store c92 into quad and increment by rs_c. + " st1 {v8.s}[3],[x5],x14 \n\t" // Store c93 into quad and increment by rs_c. + " st1 {v9.s}[0],[x5],x14 \n\t" // Store c94 into quad and increment by rs_c. + " st1 {v9.s}[1],[x5],x14 \n\t" // Store c95 into quad and increment by rs_c. + " st1 {v9.s}[2],[x5],x14 \n\t" // Store c96 into quad and increment by rs_c. + " st1 {v9.s}[3],[x5],x14 \n\t" // Store c97 into quad and increment by rs_c. + " \n\t" + " mov x5, x26 \n\t" + " \n\t" + " st1 {v10.s}[0],[x5],x14 \n\t" // Store c100 into quad and increment by rs_c. + " st1 {v10.s}[1],[x5],x14 \n\t" // Store c101 into quad and increment by rs_c. + " st1 {v10.s}[2],[x5],x14 \n\t" // Store c102 into quad and increment by rs_c. + " st1 {v10.s}[3],[x5],x14 \n\t" // Store c103 into quad and increment by rs_c. + " st1 {v11.s}[0],[x5],x14 \n\t" // Store c104 into quad and increment by rs_c. + " st1 {v11.s}[1],[x5],x14 \n\t" // Store c105 into quad and increment by rs_c. 
+ " st1 {v11.s}[2],[x5],x14 \n\t" // Store c106 into quad and increment by rs_c. + " st1 {v11.s}[3],[x5],x14 \n\t" // Store c107 into quad and increment by rs_c. + " \n\t" + " mov x5, x27 \n\t" + " \n\t" + " st1 {v12.s}[0],[x5],x14 \n\t" // Store c110 into quad and increment by rs_c. + " st1 {v12.s}[1],[x5],x14 \n\t" // Store c111 into quad and increment by rs_c. + " st1 {v12.s}[2],[x5],x14 \n\t" // Store c112 into quad and increment by rs_c. + " st1 {v12.s}[3],[x5],x14 \n\t" // Store c113 into quad and increment by rs_c. + " st1 {v13.s}[0],[x5],x14 \n\t" // Store c114 into quad and increment by rs_c. + " st1 {v13.s}[1],[x5],x14 \n\t" // Store c115 into quad and increment by rs_c. + " st1 {v13.s}[2],[x5],x14 \n\t" // Store c116 into quad and increment by rs_c. + " st1 {v13.s}[3],[x5],x14 \n\t" // Store c117 into quad and increment by rs_c. + " \n\t" + LABEL(SEND) // Done! + " \n\t" + :// output operands (none) + :// input operands + [aaddr] "m" (a), // 0 + [baddr] "m" (b), // 1 + [caddr] "m" (c), // 2 + [k_iter] "m" (k_iter), // 3 + [k_left] "m" (k_left), // 4 + [alpha] "m" (alpha), // 5 + [beta] "m" (beta), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [a_next] "m" (a_next), // 9 + [b_next] "m" (b_next) // 10 + :// Register clobber list + "x0", "x1", "x2", + "x5", "x6", "x10","x14", + "x16","x17","x19","x20", + "x21","x22","x23","x24", + "x25","x26","x27", + "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11", + "v12","v13","v14","v15", + "v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" + ); + + GEMM_UKR_FLUSH_CT( s ); } @@ -1089,24 +1094,26 @@ LABEL(SEND) // Done! o 4x4 Double precision micro-kernel NOT fully functional yet. o Runnable on ARMv8, compiled with aarch64 GCC. o Use it together with the armv8 BLIS configuration. - o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz. + o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz. December 2014.
- + * UPDATE OCTOBER 2015: Now is fully functional. * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz. * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz. - + * UPDATE NOVEMBER 2015 * Micro-kernel changed to 6x8 * Tested on Juno Board. Around 4 GFLOPS, 1 x A57 core @ 1.1 GHz. * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz. - * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz. + * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz. */ void bli_dgemm_armv8a_asm_6x8 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -1121,966 +1128,969 @@ void bli_dgemm_armv8a_asm_6x8 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; -__asm__ volatile -( -" \n\t" -" ldr x0,%[aaddr] \n\t" // Load address of A -" ldr x1,%[baddr] \n\t" // Load address of B -" ldr x2,%[caddr] \n\t" // Load address of C -" \n\t" -" ldr x5,%[k_iter] \n\t" // Init guard (k_iter) -" ldr x6,%[k_left] \n\t" // Init guard (k_iter) -" \n\t" -" ldr x10,%[cs_c] \n\t" // Load cs_c -" lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) -" \n\t" -" ldr x14,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). -" \n\t" -" add x20,x2,x10 \n\t" //Load address Column 1 of C -" add x21,x20,x10 \n\t" //Load address Column 2 of C -" add x22,x21,x10 \n\t" //Load address Column 3 of C -" add x23,x22,x10 \n\t" //Load address Column 4 of C -" add x24,x23,x10 \n\t" //Load address Column 5 of C -" add x25,x24,x10 \n\t" //Load address Column 6 of C -" add x26,x25,x10 \n\t" //Load address Column 7 of C -" \n\t" -" prfm pldl1keep,[x2] \n\t" // Prefetch c. 
-" prfm pldl1keep,[x20] \n\t" // Prefetch c. -" prfm pldl1keep,[x21] \n\t" // Prefetch c. -" prfm pldl1keep,[x22] \n\t" // Prefetch c. -" prfm pldl1keep,[x23] \n\t" // Prefetch c. -" prfm pldl1keep,[x24] \n\t" // Prefetch c. -" prfm pldl1keep,[x25] \n\t" // Prefetch c. -" prfm pldl1keep,[x26] \n\t" // Prefetch c. -" \n\t" -" dup v8.2d, xzr \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #256] \n\t" -" dup v9.2d, xzr \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #320] \n\t" -" dup v10.2d, xzr \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #384] \n\t" -" dup v11.2d, xzr \n\t" // Vector for accummulating column 1 -" prfm PLDL1KEEP, [x1, #448] \n\t" -" dup v12.2d, xzr \n\t" // Vector for accummulating column 1 -" dup v13.2d, xzr \n\t" // Vector for accummulating column 1 -" \n\t" -" dup v14.2d, xzr \n\t" // Vector for accummulating column 2 -" prfm PLDL1KEEP, [x0, #192] \n\t" -" dup v15.2d, xzr \n\t" // Vector for accummulating column 2 -" prfm PLDL1KEEP, [x0, #256] \n\t" -" dup v16.2d, xzr \n\t" // Vector for accummulating column 2 -" prfm PLDL1KEEP, [x0, #320] \n\t" -" dup v17.2d, xzr \n\t" // Vector for accummulating column 3 -" dup v18.2d, xzr \n\t" // Vector for accummulating column 3 -" dup v19.2d, xzr \n\t" // Vector for accummulating column 3 -" \n\t" -" dup v20.2d, xzr \n\t" // Vector for accummulating column 4 -" dup v21.2d, xzr \n\t" // Vector for accummulating column 4 -" dup v22.2d, xzr \n\t" // Vector for accummulating column 4 -" dup v23.2d, xzr \n\t" // Vector for accummulating column 5 -" dup v24.2d, xzr \n\t" // Vector for accummulating column 5 -" dup v25.2d, xzr \n\t" // Vector for accummulating column 5 -" \n\t" -" dup v26.2d, xzr \n\t" // Vector for accummulating column 6 -" dup v27.2d, xzr \n\t" // Vector for accummulating column 6 -" dup v28.2d, xzr \n\t" // Vector for accummulating column 6 -" dup v29.2d, xzr \n\t" // Vector for accummulating column 7 -" dup v30.2d, xzr \n\t" // Vector 
for accummulating column 7 -" dup v31.2d, xzr \n\t" // Vector for accummulating column 7 -" \n\t" -" \n\t" -" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. -BEQ(DCONSIDERKLEFT) -" \n\t" -" ldr q0, [x0] \n\t" // Load a -" ldr q1, [x0, #16] \n\t" -" ldr q2, [x0, #32] \n\t" -" \n\t" -" ldr q3, [x1] \n\t" // Load b -" ldr q4, [x1, #16] \n\t" -" ldr q5, [x1, #32] \n\t" -" ldr q6, [x1, #48] \n\t" -" \n\t" -" add x0, x0, #48 \n\t" //update address of A -" add x1, x1, #64 \n\t" //update address of B -" \n\t" -" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. -BEQ(DLASTITER) // (as loop is do-while-like). -" \n\t" -LABEL(DLOOP) // Body -" \n\t" -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" prfm PLDL1KEEP, [x1, #512] \n\t" -" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate -" prfm PLDL1KEEP, [x1, #576] \n\t" -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate -" ldr q3, [x1] \n\t" -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate -" ldr q7, [x0, #32] \n\t" -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate -" ldr q4, [x1, #16] \n\t" -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate -" ldr q5, [x1, #32] \n\t" -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" ldr q0, [x0] \n\t" -" \n\t" -" fmla 
v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" ldr q1, [x0, #16] \n\t" -" \n\t" -" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate -" ldr q6, [x1, #48] \n\t" -" \n\t" // End it 1 -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" prfm PLDL1KEEP, [x1, #640] \n\t" -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" prfm PLDL1KEEP, [x0, #336] \n\t" -" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate -" prfm PLDL1KEEP, [x0, #400] \n\t" -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate -" ldr q3, [x1, #64] \n\t" -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate -" ldr q2, [x0, #80] \n\t" -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate -" ldr q4, [x1, #80] \n\t" -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate -" ldr q5, [x1, #96] \n\t" -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" ldr q0, [x0, #48] \n\t" -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" ldr q1, [x0, #64] \n\t" -" \n\t" -" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate -" ldr q6, [x1, #112] \n\t" -" \n\t" //End it 2 -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" prfm PLDL1KEEP, [x0, #464] \n\t" -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // 
Accummulate -" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate -" ldr q3, [x1, #128] \n\t" -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate -" ldr q7, [x0, #128] \n\t" -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate -" ldr q4, [x1, #144] \n\t" -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate -" ldr q5, [x1, #160] \n\t" -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" ldr q0, [x0, #96] \n\t" -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" ldr q1, [x0, #112] \n\t" -" \n\t" -" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate -" ldr q6, [x1, #176] \n\t" -" \n\t" // End it 3 -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate -" ldr q3, [x1, #192] \n\t" -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate -" ldr q2, [x0, #176] \n\t" -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla 
v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate -" ldr q4, [x1, #208] \n\t" -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate -" ldr q5, [x1, #224] \n\t" -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" ldr q0, [x0, #144] \n\t" -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" ldr q1, [x0, #160] \n\t" -" \n\t" -" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate -" ldr q6, [x1, #240] \n\t" -" \n\t" //End it 4 -" add x0, x0, #192 \n\t" -" add x1, x1, #256 \n\t" -" \n\t" -" sub x5,x5,1 \n\t" // i-=1 -" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. 
-BNE(DLOOP) -" \n\t" -LABEL(DLASTITER) -" \n\t" -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate -" ldr q3, [x1] \n\t" -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate -" ldr q7, [x0, #32] \n\t" -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate -" ldr q4, [x1, #16] \n\t" -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate -" ldr q5, [x1, #32] \n\t" -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" ldr q0, [x0] \n\t" -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" ldr q1, [x0, #16] \n\t" -" \n\t" -" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate -" ldr q6, [x1, #48] \n\t" -" \n\t" // End it 1 -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate -" ldr q3, [x1, #64] \n\t" -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v7.2d,v4.d[0] \n\t" 
// Accummulate -" ldr q2, [x0, #80] \n\t" -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate -" ldr q4, [x1, #80] \n\t" -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate -" ldr q5, [x1, #96] \n\t" -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" ldr q0, [x0, #48] \n\t" -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" ldr q1, [x0, #64] \n\t" -" \n\t" -" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate -" ldr q6, [x1, #112] \n\t" -" \n\t" //End it 2 -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate -" ldr q3, [x1, #128] \n\t" -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate -" ldr q7, [x0, #128] \n\t" -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate -" ldr q4, [x1, #144] \n\t" -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla 
v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate -" ldr q5, [x1, #160] \n\t" -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" ldr q0, [x0, #96] \n\t" -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" ldr q1, [x0, #112] \n\t" -" \n\t" -" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate -" ldr q6, [x1, #176] \n\t" -" \n\t" // End it 3 -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" add x1, x1, #192 \n\t" -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" \n\t" -" fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate -" \n\t" //End it 4 -" add x0, x0, #144 \n\t" -" \n\t" -LABEL(DCONSIDERKLEFT) -" cmp x6,0 \n\t" // If k_left == 0, we are done. -BEQ(DPOSTACCUM) // else, we enter the k_left loop. 
-" \n\t" -LABEL(DLOOPKLEFT) -" \n\t" -" ldr q0, [x0],#16 \n\t" -" ldr q1, [x0],#16 \n\t" // Load a -" ldr q2, [x0],#16 \n\t" -" \n\t" -" ldr q3, [x1],#16 \n\t" // Load b -" ldr q4, [x1],#16 \n\t" -" ldr q5, [x1],#16 \n\t" -" ldr q6, [x1],#16 \n\t" -" \n\t" -" sub x6,x6,1 \n\t" -" \n\t" -" fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate -" fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate -" fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate -" \n\t" -" fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate -" fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate -" fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate -" \n\t" -" fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate -" fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate -" fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate -" \n\t" -" fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate -" fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate -" fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate -" \n\t" -" fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate -" fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate -" fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate -" \n\t" -" fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate -" fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate -" fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate -" \n\t" -" fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate -" fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate -" \n\t" -" fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate -" fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate -" \n\t" -" fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate -" fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate -" \n\t" -" cmp x6,0 \n\t" // Iterate again. -BNE(DLOOPKLEFT) // if i!=0. -" \n\t" -LABEL(DPOSTACCUM) -" \n\t" -" ldr x0,%[alpha] \n\t" // Alpha address -" ldr x1,%[beta] \n\t" // Beta address -" \n\t" -" ld1r {v6.2d},[x0] \n\t" // Load alpha. -" ld1r {v7.2d},[x1] \n\t" // Load beta -" \n\t" -" ldr x0,%[a_next] \n\t" // Next A address for later use. -" ldr x1,%[b_next] \n\t" // Next B address for later use. 
-" \n\t" -" cmp x14,#8 \n\t" // If rs_c != 1 (column-major) -BNE(DGENSTORED) -" \n\t" -LABEL(DCOLSTORED) // C is column-major. -" \n\t" -" dup v0.2d, xzr \n\t" -" dup v1.2d, xzr \n\t" -" dup v2.2d, xzr \n\t" -" dup v3.2d, xzr \n\t" -" dup v4.2d, xzr \n\t" -" dup v5.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. -" \n\t" -" ldr q0, [x2] \n\t" //Load column 0 of C -" ldr q1, [x2, #16] \n\t" -" ldr q2, [x2, #32] \n\t" -" \n\t" -" ldr q3, [x20] \n\t" //Load column 1 of C -" ldr q4, [x20, #16] \n\t" -" ldr q5, [x20, #32] \n\t" -" \n\t" -" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta -" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta -" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta -" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta -" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta -" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROCOLSTOREDS1) -" \n\t" -" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" str q0, [x2] \n\t" //Store column 0 of C -" str q1, [x2, #16] \n\t" -" str q2, [x2, #32] \n\t" -" \n\t" -" str q3, [x20] \n\t" //Store column 1 of C -" str q4, [x20, #16] \n\t" -" str q5, [x20, #32] \n\t" -" \n\t" -" dup v8.2d, xzr \n\t" -" dup v9.2d, xzr \n\t" -" dup v10.2d, xzr \n\t" -" dup v11.2d, xzr \n\t" -" dup v12.2d, xzr \n\t" -" dup v13.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
-" \n\t" -" ldr q8, [x21] \n\t" //Load column 2 of C -" ldr q9, [x21, #16] \n\t" -" ldr q10, [x21, #32] \n\t" -" \n\t" -" ldr q11, [x22] \n\t" //Load column 3 of C -" ldr q12, [x22, #16] \n\t" -" ldr q13, [x22, #32] \n\t" -" \n\t" -" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta -" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta -" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta -" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta -" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta -" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROCOLSTOREDS2) -" \n\t" -" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" str q8, [x21] \n\t" //Store column 2 of C -" str q9, [x21, #16] \n\t" -" str q10, [x21, #32] \n\t" -" \n\t" -" str q11, [x22] \n\t" //Store column 3 of C -" str q12, [x22, #16] \n\t" -" str q13, [x22, #32] \n\t" -" \n\t" -" dup v0.2d, xzr \n\t" -" dup v1.2d, xzr \n\t" -" dup v2.2d, xzr \n\t" -" dup v3.2d, xzr \n\t" -" dup v4.2d, xzr \n\t" -" dup v5.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
-" \n\t" -" ldr q0, [x23] \n\t" //Load column 4 of C -" ldr q1, [x23, #16] \n\t" -" ldr q2, [x23, #32] \n\t" -" \n\t" -" ldr q3, [x24] \n\t" //Load column 5 of C -" ldr q4, [x24, #16] \n\t" -" ldr q5, [x24, #32] \n\t" -" \n\t" -" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta -" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta -" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta -" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta -" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta -" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROCOLSTOREDS3) -" \n\t" -" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" str q0, [x23] \n\t" //Store column 4 of C -" str q1, [x23, #16] \n\t" -" str q2, [x23, #32] \n\t" -" \n\t" -" str q3, [x24] \n\t" //Store column 5 of C -" str q4, [x24, #16] \n\t" -" str q5, [x24, #32] \n\t" -" \n\t" -" dup v8.2d, xzr \n\t" -" dup v9.2d, xzr \n\t" -" dup v10.2d, xzr \n\t" -" dup v11.2d, xzr \n\t" -" dup v12.2d, xzr \n\t" -" dup v13.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
-" \n\t" -" ldr q8, [x25] \n\t" //Load column 6 of C -" ldr q9, [x25, #16] \n\t" -" ldr q10, [x25, #32] \n\t" -" \n\t" -" ldr q11, [x26] \n\t" //Load column 7 of C -" ldr q12, [x26, #16] \n\t" -" ldr q13, [x26, #32] \n\t" -" \n\t" -" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta -" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta -" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta -" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta -" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta -" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROCOLSTOREDS4) -" \n\t" -" prfm pldl2keep,[x0] \n\t" -" prfm pldl2keep,[x1] \n\t" -" \n\t" -" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" str q8, [x25] \n\t" //Store column 6 of C -" str q9, [x25, #16] \n\t" -" str q10, [x25, #32] \n\t" -" \n\t" -" str q11, [x26] \n\t" //Store column 7 of C -" str q12, [x26, #16] \n\t" -" str q13, [x26, #32] \n\t" -" \n\t" -BRANCH(DEND) -" \n\t" -LABEL(DGENSTORED) // C is general-stride stored. -" \n\t" -" dup v0.2d, xzr \n\t" -" dup v1.2d, xzr \n\t" -" dup v2.2d, xzr \n\t" -" dup v3.2d, xzr \n\t" -" dup v4.2d, xzr \n\t" -" dup v5.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. -" \n\t" -" mov x27, x2 \n\t" -" \n\t" // Load address of C. -" ld1 {v0.d}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. -" ld1 {v0.d}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. -" ld1 {v1.d}[0],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. -" ld1 {v1.d}[1],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. -" ld1 {v2.d}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. 
-" ld1 {v2.d}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. -" \n\t" -" mov x27, x20 \n\t" // Load address of C. -" \n\t" -" ld1 {v3.d}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. -" ld1 {v3.d}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. -" ld1 {v4.d}[0],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. -" ld1 {v4.d}[1],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. -" ld1 {v5.d}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. -" ld1 {v5.d}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. -" \n\t" -" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta -" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta -" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta -" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta -" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta -" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROGENSTOREDS1) -" \n\t" -" fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" mov x27, x2 \n\t" // Load address of C. -" \n\t" -" st1 {v0.d}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. -" st1 {v0.d}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. -" st1 {v1.d}[0],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. -" st1 {v1.d}[1],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. -" st1 {v2.d}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. -" st1 {v2.d}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. -" \n\t" -" mov x27, x20 \n\t" // Load address of C. -" \n\t" -" st1 {v3.d}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. 
-" st1 {v3.d}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. -" st1 {v4.d}[0],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. -" st1 {v4.d}[1],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. -" st1 {v5.d}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. -" st1 {v5.d}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. -" \n\t" -" dup v8.2d, xzr \n\t" -" dup v9.2d, xzr \n\t" -" dup v10.2d, xzr \n\t" -" dup v11.2d, xzr \n\t" -" dup v12.2d, xzr \n\t" -" dup v13.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. -" \n\t" -" mov x27, x21 \n\t" // Load address of C. -" \n\t" -" ld1 {v8.d}[0], [x27],x14 \n\t" // Load c20 into quad and increment by rs_c. -" ld1 {v8.d}[1], [x27],x14 \n\t" // Load c21 into quad and increment by rs_c. -" ld1 {v9.d}[0], [x27],x14 \n\t" // Load c22 into quad and increment by rs_c. -" ld1 {v9.d}[1], [x27],x14 \n\t" // Load c23 into quad and increment by rs_c. -" ld1 {v10.d}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. -" ld1 {v10.d}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. -" \n\t" -" mov x27, x22 \n\t" // Load address of C. -" \n\t" -" ld1 {v11.d}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. -" ld1 {v11.d}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. -" ld1 {v12.d}[0],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. -" ld1 {v12.d}[1],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. -" ld1 {v13.d}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. -" ld1 {v13.d}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. 
-" \n\t" -" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta -" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta -" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta -" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta -" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta -" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROGENSTOREDS2) -" \n\t" -" fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" mov x27, x21 \n\t" // Load address of C. -" \n\t" -" st1 {v8.d}[0], [x27],x14 \n\t" // Store c20 into quad and increment by rs_c. -" st1 {v8.d}[1], [x27],x14 \n\t" // Store c21 into quad and increment by rs_c. -" st1 {v9.d}[0], [x27],x14 \n\t" // Store c22 into quad and increment by rs_c. -" st1 {v9.d}[1], [x27],x14 \n\t" // Store c23 into quad and increment by rs_c. -" st1 {v10.d}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. -" st1 {v10.d}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. -" \n\t" -" mov x27, x22 \n\t" // Load address of C. -" \n\t" -" st1 {v11.d}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. -" st1 {v11.d}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. -" st1 {v12.d}[0],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. -" st1 {v12.d}[1],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. -" st1 {v13.d}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. -" st1 {v13.d}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. 
-" \n\t" -" dup v0.2d, xzr \n\t" -" dup v1.2d, xzr \n\t" -" dup v2.2d, xzr \n\t" -" dup v3.2d, xzr \n\t" -" dup v4.2d, xzr \n\t" -" dup v5.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. -" \n\t" -" mov x27, x23 \n\t" // Load address of C. -" \n\t" -" ld1 {v0.d}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. -" ld1 {v0.d}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. -" ld1 {v1.d}[0],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. -" ld1 {v1.d}[1],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. -" ld1 {v2.d}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. -" ld1 {v2.d}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. -" \n\t" -" mov x27, x24 \n\t" // Load address of C. -" \n\t" -" ld1 {v3.d}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. -" ld1 {v3.d}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. -" ld1 {v4.d}[0],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. -" ld1 {v4.d}[1],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. -" ld1 {v5.d}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. -" ld1 {v5.d}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. 
-" \n\t" -" fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta -" fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta -" fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta -" fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta -" fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta -" fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROGENSTOREDS3) -" \n\t" -" fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" mov x27, x23 \n\t" // Load address of C. -" \n\t" -" st1 {v0.d}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. -" st1 {v0.d}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. -" st1 {v1.d}[0],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. -" st1 {v1.d}[1],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. -" st1 {v2.d}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. -" st1 {v2.d}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. -" \n\t" -" mov x27, x24 \n\t" // Load address of C. -" \n\t" -" st1 {v3.d}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. -" st1 {v3.d}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. -" st1 {v4.d}[0],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. -" st1 {v4.d}[1],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. -" st1 {v5.d}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. -" st1 {v5.d}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. 
-" \n\t" -" dup v8.2d, xzr \n\t" -" dup v9.2d, xzr \n\t" -" dup v10.2d, xzr \n\t" -" dup v11.2d, xzr \n\t" -" dup v12.2d, xzr \n\t" -" dup v13.2d, xzr \n\t" -" \n\t" -" fcmp d7,#0.0 \n\t" -BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. -" \n\t" -" mov x27, x25 \n\t" -" \n\t" -" ld1 {v8.d}[0], [x27],x14 \n\t" // Load c60 into quad and increment by rs_c. -" ld1 {v8.d}[1], [x27],x14 \n\t" // Load c61 into quad and increment by rs_c. -" ld1 {v9.d}[0], [x27],x14 \n\t" // Load c62 into quad and increment by rs_c. -" ld1 {v9.d}[1], [x27],x14 \n\t" // Load c63 into quad and increment by rs_c. -" ld1 {v10.d}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. -" ld1 {v10.d}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. -" \n\t" -" mov x27, x26 \n\t" // Load address of C. -" \n\t" -" ld1 {v11.d}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. -" ld1 {v11.d}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. -" ld1 {v12.d}[0],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. -" ld1 {v12.d}[1],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. -" ld1 {v13.d}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. -" ld1 {v13.d}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. 
-" \n\t" -" fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta -" fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta -" fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta -" fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta -" fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta -" fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta -" \n\t" -LABEL(DBETAZEROGENSTOREDS4) -" \n\t" -" prfm pldl2keep,[x0] \n\t" -" prfm pldl2keep,[x1] \n\t" -" \n\t" -" fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha -" fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha -" \n\t" -" mov x27, x25 \n\t" // Load address of C. -" \n\t" -" st1 {v8.d}[0], [x27],x14 \n\t" // Store c60 into quad and increment by rs_c. -" st1 {v8.d}[1], [x27],x14 \n\t" // Store c61 into quad and increment by rs_c. -" st1 {v9.d}[0], [x27],x14 \n\t" // Store c62 into quad and increment by rs_c. -" st1 {v9.d}[1], [x27],x14 \n\t" // Store c63 into quad and increment by rs_c. -" st1 {v10.d}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. -" st1 {v10.d}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. -" \n\t" -" mov x27, x26 \n\t" // Load address of C. -" \n\t" -" st1 {v11.d}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. -" st1 {v11.d}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. -" st1 {v12.d}[0],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. -" st1 {v12.d}[1],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. -" st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. -" st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. -" \n\t" -LABEL(DEND) // Done! 
-" \n\t" -:// output operands (none) -:// input operands - [aaddr] "m" (a), // 0 - [baddr] "m" (b), // 1 - [caddr] "m" (c), // 2 - [k_iter] "m" (k_iter), // 3 - [k_left] "m" (k_left), // 4 - [alpha] "m" (alpha), // 5 - [beta] "m" (beta), // 6 - [rs_c] "m" (rs_c), // 6 - [cs_c] "m" (cs_c), // 7 - [a_next] "m" (a_next), // 8 - [b_next] "m" (b_next) // 9 -:// Register clobber list - "x0","x1","x2", - "x5","x6","x10", - "x14","x16","x17", - "x20","x21","x22","x23","x24","x25","x26","x27", - "v0","v1","v2", - "v3","v4","v5", - "v6","v7","v8", - "v9","v10","v11", - "v12","v13","v14", - "v15","v16","v17","v18","v19", - "v20","v21","v22","v23", - "v24","v25","v26","v27", - "v28","v29","v30","v31" -); - + GEMM_UKR_SETUP_CT( d, 6, 8, false ); + __asm__ volatile + ( + " \n\t" + " ldr x0,%[aaddr] \n\t" // Load address of A + " ldr x1,%[baddr] \n\t" // Load address of B + " ldr x2,%[caddr] \n\t" // Load address of C + " \n\t" + " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) + " ldr x6,%[k_left] \n\t" // Init guard (k_iter) + " \n\t" + " ldr x10,%[cs_c] \n\t" // Load cs_c + " lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) + " \n\t" + " ldr x14,%[rs_c] \n\t" // Load rs_c. + " lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). + " \n\t" + " add x20,x2,x10 \n\t" //Load address Column 1 of C + " add x21,x20,x10 \n\t" //Load address Column 2 of C + " add x22,x21,x10 \n\t" //Load address Column 3 of C + " add x23,x22,x10 \n\t" //Load address Column 4 of C + " add x24,x23,x10 \n\t" //Load address Column 5 of C + " add x25,x24,x10 \n\t" //Load address Column 6 of C + " add x26,x25,x10 \n\t" //Load address Column 7 of C + " \n\t" + " prfm pldl1keep,[x2] \n\t" // Prefetch c. + " prfm pldl1keep,[x20] \n\t" // Prefetch c. + " prfm pldl1keep,[x21] \n\t" // Prefetch c. + " prfm pldl1keep,[x22] \n\t" // Prefetch c. + " prfm pldl1keep,[x23] \n\t" // Prefetch c. + " prfm pldl1keep,[x24] \n\t" // Prefetch c. + " prfm pldl1keep,[x25] \n\t" // Prefetch c. + " prfm pldl1keep,[x26] \n\t" // Prefetch c. 
+ " \n\t" + " dup v8.2d, xzr \n\t" // Vector for accummulating column 0 + " prfm PLDL1KEEP, [x1, #256] \n\t" + " dup v9.2d, xzr \n\t" // Vector for accummulating column 0 + " prfm PLDL1KEEP, [x1, #320] \n\t" + " dup v10.2d, xzr \n\t" // Vector for accummulating column 0 + " prfm PLDL1KEEP, [x1, #384] \n\t" + " dup v11.2d, xzr \n\t" // Vector for accummulating column 1 + " prfm PLDL1KEEP, [x1, #448] \n\t" + " dup v12.2d, xzr \n\t" // Vector for accummulating column 1 + " dup v13.2d, xzr \n\t" // Vector for accummulating column 1 + " \n\t" + " dup v14.2d, xzr \n\t" // Vector for accummulating column 2 + " prfm PLDL1KEEP, [x0, #192] \n\t" + " dup v15.2d, xzr \n\t" // Vector for accummulating column 2 + " prfm PLDL1KEEP, [x0, #256] \n\t" + " dup v16.2d, xzr \n\t" // Vector for accummulating column 2 + " prfm PLDL1KEEP, [x0, #320] \n\t" + " dup v17.2d, xzr \n\t" // Vector for accummulating column 3 + " dup v18.2d, xzr \n\t" // Vector for accummulating column 3 + " dup v19.2d, xzr \n\t" // Vector for accummulating column 3 + " \n\t" + " dup v20.2d, xzr \n\t" // Vector for accummulating column 4 + " dup v21.2d, xzr \n\t" // Vector for accummulating column 4 + " dup v22.2d, xzr \n\t" // Vector for accummulating column 4 + " dup v23.2d, xzr \n\t" // Vector for accummulating column 5 + " dup v24.2d, xzr \n\t" // Vector for accummulating column 5 + " dup v25.2d, xzr \n\t" // Vector for accummulating column 5 + " \n\t" + " dup v26.2d, xzr \n\t" // Vector for accummulating column 6 + " dup v27.2d, xzr \n\t" // Vector for accummulating column 6 + " dup v28.2d, xzr \n\t" // Vector for accummulating column 6 + " dup v29.2d, xzr \n\t" // Vector for accummulating column 7 + " dup v30.2d, xzr \n\t" // Vector for accummulating column 7 + " dup v31.2d, xzr \n\t" // Vector for accummulating column 7 + " \n\t" + " \n\t" + " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. 
+ BEQ(DCONSIDERKLEFT) + " \n\t" + " ldr q0, [x0] \n\t" // Load a + " ldr q1, [x0, #16] \n\t" + " ldr q2, [x0, #32] \n\t" + " \n\t" + " ldr q3, [x1] \n\t" // Load b + " ldr q4, [x1, #16] \n\t" + " ldr q5, [x1, #32] \n\t" + " ldr q6, [x1, #48] \n\t" + " \n\t" + " add x0, x0, #48 \n\t" //update address of A + " add x1, x1, #64 \n\t" //update address of B + " \n\t" + " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. + BEQ(DLASTITER) // (as loop is do-while-like). + " \n\t" + LABEL(DLOOP) // Body + " \n\t" + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " prfm PLDL1KEEP, [x1, #512] \n\t" + " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate + " prfm PLDL1KEEP, [x1, #576] \n\t" + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate + " ldr q3, [x1] \n\t" + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate + " ldr q7, [x0, #32] \n\t" + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate + " ldr q4, [x1, #16] \n\t" + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate + " ldr q5, [x1, #32] \n\t" + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " ldr q0, [x0] \n\t" + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " ldr q1, [x0, #16] \n\t" + 
" \n\t" + " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate + " ldr q6, [x1, #48] \n\t" + " \n\t" // End it 1 + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " prfm PLDL1KEEP, [x1, #640] \n\t" + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " prfm PLDL1KEEP, [x0, #336] \n\t" + " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate + " prfm PLDL1KEEP, [x0, #400] \n\t" + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate + " ldr q3, [x1, #64] \n\t" + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate + " ldr q2, [x0, #80] \n\t" + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate + " ldr q4, [x1, #80] \n\t" + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate + " ldr q5, [x1, #96] \n\t" + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " ldr q0, [x0, #48] \n\t" + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " ldr q1, [x0, #64] \n\t" + " \n\t" + " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate + " ldr q6, [x1, #112] \n\t" + " \n\t" //End it 2 + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " prfm PLDL1KEEP, [x0, #464] \n\t" + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate + " \n\t" + " 
fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate + " ldr q3, [x1, #128] \n\t" + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate + " ldr q7, [x0, #128] \n\t" + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate + " ldr q4, [x1, #144] \n\t" + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate + " ldr q5, [x1, #160] \n\t" + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " ldr q0, [x0, #96] \n\t" + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " ldr q1, [x0, #112] \n\t" + " \n\t" + " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate + " ldr q6, [x1, #176] \n\t" + " \n\t" // End it 3 + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate + " ldr q3, [x1, #192] \n\t" + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate + " ldr q2, [x0, #176] \n\t" + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // 
Accummulate + " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate + " ldr q4, [x1, #208] \n\t" + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate + " ldr q5, [x1, #224] \n\t" + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " ldr q0, [x0, #144] \n\t" + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " ldr q1, [x0, #160] \n\t" + " \n\t" + " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate + " ldr q6, [x1, #240] \n\t" + " \n\t" //End it 4 + " add x0, x0, #192 \n\t" + " add x1, x1, #256 \n\t" + " \n\t" + " sub x5,x5,1 \n\t" // i-=1 + " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. 
+ BNE(DLOOP) + " \n\t" + LABEL(DLASTITER) + " \n\t" + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate + " ldr q3, [x1] \n\t" + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate + " ldr q7, [x0, #32] \n\t" + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate + " ldr q4, [x1, #16] \n\t" + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate + " ldr q5, [x1, #32] \n\t" + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " ldr q0, [x0] \n\t" + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " ldr q1, [x0, #16] \n\t" + " \n\t" + " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate + " ldr q6, [x1, #48] \n\t" + " \n\t" // End it 1 + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate + " ldr q3, [x1, #64] \n\t" + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] 
\n\t" // Accummulate + " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate + " ldr q2, [x0, #80] \n\t" + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate + " ldr q4, [x1, #80] \n\t" + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate + " ldr q5, [x1, #96] \n\t" + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " ldr q0, [x0, #48] \n\t" + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " ldr q1, [x0, #64] \n\t" + " \n\t" + " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate + " ldr q6, [x1, #112] \n\t" + " \n\t" //End it 2 + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate + " ldr q3, [x1, #128] \n\t" + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate + " ldr q7, [x0, #128] \n\t" + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate + " ldr q4, [x1, #144] \n\t" + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate + " \n\t" + " fmla 
v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate + " ldr q5, [x1, #160] \n\t" + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " ldr q0, [x0, #96] \n\t" + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " ldr q1, [x0, #112] \n\t" + " \n\t" + " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate + " ldr q6, [x1, #176] \n\t" + " \n\t" // End it 3 + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " add x1, x1, #192 \n\t" + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " \n\t" + " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate + " \n\t" //End it 4 + " add x0, x0, #144 \n\t" + " 
\n\t" + LABEL(DCONSIDERKLEFT) + " cmp x6,0 \n\t" // If k_left == 0, we are done. + BEQ(DPOSTACCUM) // else, we enter the k_left loop. + " \n\t" + LABEL(DLOOPKLEFT) + " \n\t" + " ldr q0, [x0],#16 \n\t" + " ldr q1, [x0],#16 \n\t" // Load a + " ldr q2, [x0],#16 \n\t" + " \n\t" + " ldr q3, [x1],#16 \n\t" // Load b + " ldr q4, [x1],#16 \n\t" + " ldr q5, [x1],#16 \n\t" + " ldr q6, [x1],#16 \n\t" + " \n\t" + " sub x6,x6,1 \n\t" + " \n\t" + " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate + " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate + " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate + " \n\t" + " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate + " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate + " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate + " \n\t" + " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate + " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate + " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate + " \n\t" + " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate + " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate + " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate + " \n\t" + " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate + " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate + " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate + " \n\t" + " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate + " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate + " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate + " \n\t" + " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate + " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate + " \n\t" + " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate + " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate + " \n\t" + " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate + " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate + " \n\t" + " cmp x6,0 \n\t" // Iterate again. + BNE(DLOOPKLEFT) // if i!=0. + " \n\t" + LABEL(DPOSTACCUM) + " \n\t" + " ldr x0,%[alpha] \n\t" // Alpha address + " ldr x1,%[beta] \n\t" // Beta address + " \n\t" + " ld1r {v6.2d},[x0] \n\t" // Load alpha. 
+ " ld1r {v7.2d},[x1] \n\t" // Load beta + " \n\t" + " ldr x0,%[a_next] \n\t" // Next A address for later use. + " ldr x1,%[b_next] \n\t" // Next B address for later use. + " \n\t" + " cmp x14,#8 \n\t" // If rs_c != 1 (column-major) + BNE(DGENSTORED) + " \n\t" + LABEL(DCOLSTORED) // C is column-major. + " \n\t" + " dup v0.2d, xzr \n\t" + " dup v1.2d, xzr \n\t" + " dup v2.2d, xzr \n\t" + " dup v3.2d, xzr \n\t" + " dup v4.2d, xzr \n\t" + " dup v5.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. + " \n\t" + " ldr q0, [x2] \n\t" //Load column 0 of C + " ldr q1, [x2, #16] \n\t" + " ldr q2, [x2, #32] \n\t" + " \n\t" + " ldr q3, [x20] \n\t" //Load column 1 of C + " ldr q4, [x20, #16] \n\t" + " ldr q5, [x20, #32] \n\t" + " \n\t" + " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta + " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta + " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta + " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta + " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta + " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROCOLSTOREDS1) + " \n\t" + " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " str q0, [x2] \n\t" //Store column 0 of C + " str q1, [x2, #16] \n\t" + " str q2, [x2, #32] \n\t" + " \n\t" + " str q3, [x20] \n\t" //Store column 1 of C + " str q4, [x20, #16] \n\t" + " str q5, [x20, #32] \n\t" + " \n\t" + " dup v8.2d, xzr \n\t" + " dup v9.2d, xzr \n\t" + " dup v10.2d, xzr \n\t" + " dup v11.2d, xzr \n\t" + " dup v12.2d, xzr \n\t" + " dup v13.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
+ " \n\t" + " ldr q8, [x21] \n\t" //Load column 2 of C + " ldr q9, [x21, #16] \n\t" + " ldr q10, [x21, #32] \n\t" + " \n\t" + " ldr q11, [x22] \n\t" //Load column 3 of C + " ldr q12, [x22, #16] \n\t" + " ldr q13, [x22, #32] \n\t" + " \n\t" + " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta + " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta + " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta + " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta + " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta + " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROCOLSTOREDS2) + " \n\t" + " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " str q8, [x21] \n\t" //Store column 2 of C + " str q9, [x21, #16] \n\t" + " str q10, [x21, #32] \n\t" + " \n\t" + " str q11, [x22] \n\t" //Store column 3 of C + " str q12, [x22, #16] \n\t" + " str q13, [x22, #32] \n\t" + " \n\t" + " dup v0.2d, xzr \n\t" + " dup v1.2d, xzr \n\t" + " dup v2.2d, xzr \n\t" + " dup v3.2d, xzr \n\t" + " dup v4.2d, xzr \n\t" + " dup v5.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
+ " \n\t" + " ldr q0, [x23] \n\t" //Load column 4 of C + " ldr q1, [x23, #16] \n\t" + " ldr q2, [x23, #32] \n\t" + " \n\t" + " ldr q3, [x24] \n\t" //Load column 5 of C + " ldr q4, [x24, #16] \n\t" + " ldr q5, [x24, #32] \n\t" + " \n\t" + " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta + " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta + " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta + " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta + " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta + " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROCOLSTOREDS3) + " \n\t" + " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " str q0, [x23] \n\t" //Store column 4 of C + " str q1, [x23, #16] \n\t" + " str q2, [x23, #32] \n\t" + " \n\t" + " str q3, [x24] \n\t" //Store column 5 of C + " str q4, [x24, #16] \n\t" + " str q5, [x24, #32] \n\t" + " \n\t" + " dup v8.2d, xzr \n\t" + " dup v9.2d, xzr \n\t" + " dup v10.2d, xzr \n\t" + " dup v11.2d, xzr \n\t" + " dup v12.2d, xzr \n\t" + " dup v13.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
+ " \n\t" + " ldr q8, [x25] \n\t" //Load column 6 of C + " ldr q9, [x25, #16] \n\t" + " ldr q10, [x25, #32] \n\t" + " \n\t" + " ldr q11, [x26] \n\t" //Load column 7 of C + " ldr q12, [x26, #16] \n\t" + " ldr q13, [x26, #32] \n\t" + " \n\t" + " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta + " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta + " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta + " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta + " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta + " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROCOLSTOREDS4) + " \n\t" + " prfm pldl2keep,[x0] \n\t" + " prfm pldl2keep,[x1] \n\t" + " \n\t" + " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " str q8, [x25] \n\t" //Store column 6 of C + " str q9, [x25, #16] \n\t" + " str q10, [x25, #32] \n\t" + " \n\t" + " str q11, [x26] \n\t" //Store column 7 of C + " str q12, [x26, #16] \n\t" + " str q13, [x26, #32] \n\t" + " \n\t" + BRANCH(DEND) + " \n\t" + LABEL(DGENSTORED) // C is general-stride stored. + " \n\t" + " dup v0.2d, xzr \n\t" + " dup v1.2d, xzr \n\t" + " dup v2.2d, xzr \n\t" + " dup v3.2d, xzr \n\t" + " dup v4.2d, xzr \n\t" + " dup v5.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. + " \n\t" + " mov x27, x2 \n\t" + " \n\t" // Load address of C. + " ld1 {v0.d}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. + " ld1 {v0.d}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. + " ld1 {v1.d}[0],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. + " ld1 {v1.d}[1],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. 
+ " ld1 {v2.d}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. + " ld1 {v2.d}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. + " \n\t" + " mov x27, x20 \n\t" // Load address of C. + " \n\t" + " ld1 {v3.d}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. + " ld1 {v3.d}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. + " ld1 {v4.d}[0],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. + " ld1 {v4.d}[1],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. + " ld1 {v5.d}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. + " ld1 {v5.d}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. + " \n\t" + " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta + " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta + " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta + " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta + " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta + " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROGENSTOREDS1) + " \n\t" + " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " mov x27, x2 \n\t" // Load address of C. + " \n\t" + " st1 {v0.d}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. + " st1 {v0.d}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. + " st1 {v1.d}[0],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. + " st1 {v1.d}[1],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. + " st1 {v2.d}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. + " st1 {v2.d}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. + " \n\t" + " mov x27, x20 \n\t" // Load address of C. 
+ " \n\t" + " st1 {v3.d}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. + " st1 {v3.d}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. + " st1 {v4.d}[0],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. + " st1 {v4.d}[1],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. + " st1 {v5.d}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. + " st1 {v5.d}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. + " \n\t" + " dup v8.2d, xzr \n\t" + " dup v9.2d, xzr \n\t" + " dup v10.2d, xzr \n\t" + " dup v11.2d, xzr \n\t" + " dup v12.2d, xzr \n\t" + " dup v13.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. + " \n\t" + " mov x27, x21 \n\t" // Load address of C. + " \n\t" + " ld1 {v8.d}[0], [x27],x14 \n\t" // Load c20 into quad and increment by rs_c. + " ld1 {v8.d}[1], [x27],x14 \n\t" // Load c21 into quad and increment by rs_c. + " ld1 {v9.d}[0], [x27],x14 \n\t" // Load c22 into quad and increment by rs_c. + " ld1 {v9.d}[1], [x27],x14 \n\t" // Load c23 into quad and increment by rs_c. + " ld1 {v10.d}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. + " ld1 {v10.d}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. + " \n\t" + " mov x27, x22 \n\t" // Load address of C. + " \n\t" + " ld1 {v11.d}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. + " ld1 {v11.d}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. + " ld1 {v12.d}[0],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. + " ld1 {v12.d}[1],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. + " ld1 {v13.d}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. + " ld1 {v13.d}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. 
+ " \n\t" + " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta + " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta + " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta + " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta + " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta + " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROGENSTOREDS2) + " \n\t" + " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " mov x27, x21 \n\t" // Load address of C. + " \n\t" + " st1 {v8.d}[0], [x27],x14 \n\t" // Store c20 into quad and increment by rs_c. + " st1 {v8.d}[1], [x27],x14 \n\t" // Store c21 into quad and increment by rs_c. + " st1 {v9.d}[0], [x27],x14 \n\t" // Store c22 into quad and increment by rs_c. + " st1 {v9.d}[1], [x27],x14 \n\t" // Store c23 into quad and increment by rs_c. + " st1 {v10.d}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. + " st1 {v10.d}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. + " \n\t" + " mov x27, x22 \n\t" // Load address of C. + " \n\t" + " st1 {v11.d}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. + " st1 {v11.d}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. + " st1 {v12.d}[0],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. + " st1 {v12.d}[1],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. + " st1 {v13.d}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. + " st1 {v13.d}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. 
+ " \n\t" + " dup v0.2d, xzr \n\t" + " dup v1.2d, xzr \n\t" + " dup v2.2d, xzr \n\t" + " dup v3.2d, xzr \n\t" + " dup v4.2d, xzr \n\t" + " dup v5.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. + " \n\t" + " mov x27, x23 \n\t" // Load address of C. + " \n\t" + " ld1 {v0.d}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. + " ld1 {v0.d}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. + " ld1 {v1.d}[0],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. + " ld1 {v1.d}[1],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. + " ld1 {v2.d}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. + " ld1 {v2.d}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. + " \n\t" + " mov x27, x24 \n\t" // Load address of C. + " \n\t" + " ld1 {v3.d}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. + " ld1 {v3.d}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. + " ld1 {v4.d}[0],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. + " ld1 {v4.d}[1],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. + " ld1 {v5.d}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. + " ld1 {v5.d}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. 
+ " \n\t" + " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta + " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta + " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta + " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta + " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta + " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROGENSTOREDS3) + " \n\t" + " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " mov x27, x23 \n\t" // Load address of C. + " \n\t" + " st1 {v0.d}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. + " st1 {v0.d}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. + " st1 {v1.d}[0],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. + " st1 {v1.d}[1],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. + " st1 {v2.d}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. + " st1 {v2.d}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. + " \n\t" + " mov x27, x24 \n\t" // Load address of C. + " \n\t" + " st1 {v3.d}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. + " st1 {v3.d}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. + " st1 {v4.d}[0],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. + " st1 {v4.d}[1],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. + " st1 {v5.d}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. + " st1 {v5.d}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. 
+ " \n\t" + " dup v8.2d, xzr \n\t" + " dup v9.2d, xzr \n\t" + " dup v10.2d, xzr \n\t" + " dup v11.2d, xzr \n\t" + " dup v12.2d, xzr \n\t" + " dup v13.2d, xzr \n\t" + " \n\t" + " fcmp d7,#0.0 \n\t" + BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. + " \n\t" + " mov x27, x25 \n\t" + " \n\t" + " ld1 {v8.d}[0], [x27],x14 \n\t" // Load c60 into quad and increment by rs_c. + " ld1 {v8.d}[1], [x27],x14 \n\t" // Load c61 into quad and increment by rs_c. + " ld1 {v9.d}[0], [x27],x14 \n\t" // Load c62 into quad and increment by rs_c. + " ld1 {v9.d}[1], [x27],x14 \n\t" // Load c63 into quad and increment by rs_c. + " ld1 {v10.d}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. + " ld1 {v10.d}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. + " \n\t" + " mov x27, x26 \n\t" // Load address of C. + " \n\t" + " ld1 {v11.d}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. + " ld1 {v11.d}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. + " ld1 {v12.d}[0],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. + " ld1 {v12.d}[1],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. + " ld1 {v13.d}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. + " ld1 {v13.d}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. 
+ " \n\t" + " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta + " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta + " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta + " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta + " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta + " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta + " \n\t" + LABEL(DBETAZEROGENSTOREDS4) + " \n\t" + " prfm pldl2keep,[x0] \n\t" + " prfm pldl2keep,[x1] \n\t" + " \n\t" + " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha + " fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha + " \n\t" + " mov x27, x25 \n\t" // Load address of C. + " \n\t" + " st1 {v8.d}[0], [x27],x14 \n\t" // Store c60 into quad and increment by rs_c. + " st1 {v8.d}[1], [x27],x14 \n\t" // Store c61 into quad and increment by rs_c. + " st1 {v9.d}[0], [x27],x14 \n\t" // Store c62 into quad and increment by rs_c. + " st1 {v9.d}[1], [x27],x14 \n\t" // Store c63 into quad and increment by rs_c. + " st1 {v10.d}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. + " st1 {v10.d}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. + " \n\t" + " mov x27, x26 \n\t" // Load address of C. + " \n\t" + " st1 {v11.d}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. + " st1 {v11.d}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. + " st1 {v12.d}[0],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. + " st1 {v12.d}[1],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. + " st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. + " st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. + " \n\t" + LABEL(DEND) // Done! 
+ " \n\t" + :// output operands (none) + :// input operands + [aaddr] "m" (a), // 0 + [baddr] "m" (b), // 1 + [caddr] "m" (c), // 2 + [k_iter] "m" (k_iter), // 3 + [k_left] "m" (k_left), // 4 + [alpha] "m" (alpha), // 5 + [beta] "m" (beta), // 6 + [rs_c] "m" (rs_c), // 6 + [cs_c] "m" (cs_c), // 7 + [a_next] "m" (a_next), // 8 + [b_next] "m" (b_next) // 9 + :// Register clobber list + "x0","x1","x2", + "x5","x6","x10", + "x14","x16","x17", + "x20","x21","x22","x23","x24","x25","x26","x27", + "v0","v1","v2", + "v3","v4","v5", + "v6","v7","v8", + "v9","v10","v11", + "v12","v13","v14", + "v15","v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" + ); + GEMM_UKR_FLUSH_CT( d ); } #if 0 void bli_cgemm_armv8a_opt_4x4 ( + dim_t m, + dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, @@ -2095,6 +2105,8 @@ void bli_cgemm_armv8a_opt_4x4 void bli_zgemm_armv8a_opt_4x4 ( + dim_t m, + dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c index 1612e69b0..15e3e072f 100644 --- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c +++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c @@ -56,6 +56,8 @@ void bli_dgemm_bgq_int_8x8 ( + dim_t m, + dim_t n, dim_t k, double* restrict alpha, double* restrict a, @@ -66,6 +68,8 @@ void bli_dgemm_bgq_int_8x8 cntx_t* restrict cntx ) { + GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false ); + //Registers for storing C. 
//4 4x4 subblocks of C, c00, c01, c10, c11 //4 registers per subblock: a, b, c, d @@ -201,6 +205,8 @@ void bli_dgemm_bgq_int_8x8 UPDATE( AB, c, 0 ); AB = vec_perm( c11d, c11d, pattern ); UPDATE( AB, c, 4 ); + + GEMM_UKR_FLUSH_CT( d ); } void printvec(vector4double v) @@ -214,6 +220,8 @@ void printvec(vector4double v) void bli_zgemm_bgq_int_4x4 ( + dim_t m, + dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, @@ -224,6 +232,8 @@ void bli_zgemm_bgq_int_4x4 cntx_t* restrict cntx ) { + GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false ); + double* a_d = ( double* )a; double* b_d = ( double* )b; double* c_d = ( double* )c; @@ -368,4 +378,6 @@ void bli_zgemm_bgq_int_4x4 c_d += 2*cs_c; ZUPDATE( c03a, c03b, c_d, 0 ); ZUPDATE( c13a, c13b, c_d, 4 ); + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c index 403aaaaee..3a75d61d7 100644 --- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c +++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c @@ -90,7 +90,9 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -102,25 +104,27 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_ALIGNED( s, 8, 8, false, 32 ); + begin_asm() - + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. - + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b. 
vpermilps(imm(0x4e), ymm2, ymm3) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c; - + lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c @@ -130,7 +134,7 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c - + vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) @@ -139,15 +143,15 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - + label(.SLOOPKITER) // MAIN LOOP - + // iteration 0 prefetch(0, mem(rax, 16*32)) vfmaddps(ymm15, ymm0, ymm2, ymm15) @@ -155,44 +159,44 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 vmovshdup(mem(rbx, 0*32), ymm2) vfmaddps(ymm13, ymm0, ymm3, ymm13) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vmovaps(mem(rax, 1*32), ymm1) vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm11, ymm0, ymm4, ymm11) vfmaddps(ymm9, ymm0, ymm5, ymm9) - + vfmaddps(ymm14, ymm0, ymm2, ymm14) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 1*32), ymm2) vfmaddps(ymm12, ymm0, ymm3, ymm12) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm10, ymm0, ymm4, ymm10) vfmaddps(ymm8, ymm0, ymm5, ymm8) - + // iteration 1 vfmaddps(ymm15, ymm1, ymm2, ymm15) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovshdup(mem(rbx, 1*32), ymm2) vfmaddps(ymm13, ymm1, ymm3, ymm13) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vmovaps(mem(rax, 2*32), ymm0) vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm11, ymm1, 
ymm4, ymm11) vfmaddps(ymm9, ymm1, ymm5, ymm9) - + vfmaddps(ymm14, ymm1, ymm2, ymm14) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 2*32), ymm2) vfmaddps(ymm12, ymm1, ymm3, ymm12) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm10, ymm1, ymm4, ymm10) vfmaddps(ymm8, ymm1, ymm5, ymm8) - + // iteration 2 prefetch(0, mem(rax, 18*32)) vfmaddps(ymm15, ymm0, ymm2, ymm15) @@ -200,23 +204,23 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 vmovshdup(mem(rbx, 2*32), ymm2) vfmaddps(ymm13, ymm0, ymm3, ymm13) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vmovaps(mem(rax, 3*32), ymm1) add(imm(4*8*4), rax) // a += 4*8 (unroll x mr) vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm11, ymm0, ymm4, ymm11) vfmaddps(ymm9, ymm0, ymm5, ymm9) - + vfmaddps(ymm14, ymm0, ymm2, ymm14) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 3*32), ymm2) vfmaddps(ymm12, ymm0, ymm3, ymm12) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm10, ymm0, ymm4, ymm10) vfmaddps(ymm8, ymm0, ymm5, ymm8) - + // iteration 3 vfmaddps(ymm15, ymm1, ymm2, ymm15) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) @@ -224,134 +228,134 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 add(imm(4*8*4), rbx) // b += 4*8 (unroll x nr) vfmaddps(ymm13, ymm1, ymm3, ymm13) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vmovaps(mem(rax, 0*32), ymm0) vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm11, ymm1, ymm4, ymm11) vfmaddps(ymm9, ymm1, ymm5, ymm9) - + vfmaddps(ymm14, ymm1, ymm2, ymm14) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 0*32), ymm2) vfmaddps(ymm12, ymm1, ymm3, ymm12) vperm2f128(imm(0x03), ymm3, ymm3, ymm5) - + vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm10, ymm1, ymm4, ymm10) vfmaddps(ymm8, ymm1, ymm5, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - + label(.SLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 16*32)) vfmaddps(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmovshdup(mem(rbx, 0*32), ymm2) vfmaddps(ymm13, ymm0, ymm3, ymm13) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) - + vmovaps(mem(rax, 1*32), ymm1) add(imm(8*1*4), rax) // a += 8 (1 x mr) vpermilps(imm(0x4e), ymm2, ymm3) vfmaddps(ymm11, ymm0, ymm4, ymm11) vfmaddps(ymm9, ymm0, ymm5, ymm9) - - vfmaddps(ymm14, ymm0, ymm2, ymm14) + + vfmaddps(ymm14, ymm0, ymm2, ymm14) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 1*32), ymm2) add(imm(8*1*4), rbx) // b += 8 (1 x nr) - vfmaddps(ymm12, ymm0, ymm3, ymm12) + vfmaddps(ymm12, ymm0, ymm3, ymm12) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) - + vpermilps(imm(0x4e), ymm2, ymm3) - vfmaddps(ymm10, ymm0, ymm4, ymm10) - vfmaddps(ymm8, ymm0, ymm5, ymm8) + vfmaddps(ymm10, ymm0, ymm4, ymm10) + vfmaddps(ymm8, ymm0, ymm5, ymm8) vmovaps(ymm1, ymm0) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - + + label(.SPOSTACCUM) // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab02 ( ab04 ( ab06 - // ab10 ab12 ab14 ab16 + // ab10 ab12 ab14 ab16 // ab22 ab20 ab26 ab24 // ab32 ab30 ab36 ab34 // ab44 ab46 ab40 ab42 - // ab54 ab56 ab50 ab52 + // ab54 ab56 ab50 ab52 // ab66 ab64 ab62 ab60 // ab76 ) ab74 ) ab72 ) ab70 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab01 ( ab03 ( ab05 ( ab07 - // ab11 ab13 ab15 ab17 + // ab11 ab13 ab15 ab17 // ab23 ab21 ab27 ab25 // ab33 ab31 ab37 ab35 // ab45 ab47 ab41 ab43 - // ab55 ab57 ab51 ab53 + // ab55 ab57 ab51 ab53 // ab67 ab65 ab63 ab61 // ab77 ) ab75 ) ab73 ) ab71 ) GROUP_YMM_BY_4 // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab02 ( ab04 ( ab06 - // ab10 ab12 ab14 ab16 + // ab10 ab12 ab14 ab16 // ab20 ab22 ab24 ab26 // ab30 ab32 ab34 ab36 // ab44 ab46 ab40 ab42 - // ab54 ab56 ab50 ab52 + // ab54 ab56 ab50 ab52 // ab64 ab66 ab60 ab62 // ab74 ) ab76 ) ab70 ) ab72 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab01 ( ab03 ( ab05 ( ab07 - // ab11 ab13 ab15 ab17 + // ab11 ab13 ab15 ab17 // ab21 ab23 ab25 ab27 // ab31 ab33 ab35 ab37 // ab45 ab47 ab41 ab43 - // ab55 ab57 ab51 ab53 + // ab55 ab57 ab51 ab53 // ab65 ab67 ab61 ab63 // ab75 ) ab77 ) ab71 ) ab73 ) // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab02 ( ab04 ( ab06 - // ab10 ab12 ab14 ab16 + // ab10 ab12 ab14 ab16 // ab20 ab22 ab24 ab26 // ab30 ab32 ab34 ab36 // ab40 ab42 ab44 ab46 - // ab50 ab52 ab54 ab56 + // ab50 ab52 ab54 ab56 // ab60 ab62 ab64 ab66 // ab70 ) ab72 ) ab74 ) ab76 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab01 ( ab03 ( ab05 ( ab07 - // ab11 ab13 ab15 ab17 + // ab11 ab13 ab15 ab17 // ab21 ab23 ab25 ab27 // ab31 ab33 ab35 ab37 // ab41 ab43 ab45 ab47 - // ab51 ab53 ab55 ab57 + // ab51 ab53 ab55 ab57 // ab61 ab63 ab65 ab67 // ab71 ) ab73 ) ab75 ) ab77 ) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm4) // load beta and duplicate - + vmulps(ymm0, ymm8, ymm8) // 
scale by alpha vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) @@ -360,401 +364,115 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) - - lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - - lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; - lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; - - - // determine if - // c % 32 == 0, AND - // 4*cs_c % 32 == 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(31), rcx) // set ZF if c & 32 is zero. - setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); - test(imm(31), rdi) // set ZF if (4*cs_c) & 32 is zero. - setz(al) // al = ( ZF == 0 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm4) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. 
- jne(.SCOLSTORED) // jump to column storage case - - - label(.SGENSTORED) - // update c00:c70 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c01:c71 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm14, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm14, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c02:c72 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm13, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm 
result, - - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c03:c73 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm12, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm12, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c04:c74 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm11, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c05:c75 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// 
vaddps(ymm10, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm10, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c06:c76 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm9, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c07:c77 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm8, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm8, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - - STORE_SS - - - - jmp(.SDONE) // jump to end. - - - - label(.SCOLSTORED) - - - vmovaps(mem(rcx), ymm0) // load c00:c70, -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm15, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - vmovaps(ymm0, mem(rcx)) // and store back to memory. 
- add(rdi, rcx) // c += cs_c; - - vmovaps(mem(rcx), ymm1) // load c01:c71, -// vmulps(ymm4, ymm1, ymm1) // scale by beta, -// vaddps(ymm14, ymm1, ymm1) // add the gemm result, - vfmaddps(ymm14, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, - vmovaps(ymm1, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(mem(rcx), ymm0) // load c02:c72, -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm13, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - vmovaps(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(mem(rcx), ymm1) // load c03:c73, -// vmulps(ymm4, ymm1, ymm1) // scale by beta, -// vaddps(ymm12, ymm1, ymm1) // add the gemm result, - vfmaddps(ymm12, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, - vmovaps(ymm1, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(mem(rcx), ymm0) // load c04:c74, -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm11, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - vmovaps(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(mem(rcx), ymm1) // load c05:c75, -// vmulps(ymm4, ymm1, ymm1) // scale by beta, -// vaddps(ymm10, ymm1, ymm1) // add the gemm result, - vfmaddps(ymm10, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, - vmovaps(ymm1, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(mem(rcx), ymm0) // load c06:c76, -// vmulps(ymm4, ymm0, ymm0) // scale by beta, -// vaddps(ymm9, ymm0, ymm0) // add the gemm result, - vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, - vmovaps(ymm0, mem(rcx)) // and store back to memory. 
- add(rdi, rcx) // c += cs_c; - - vmovaps(mem(rcx), ymm1) // load c07:c77, -// vmulps(ymm4, ymm1, ymm1) // scale by beta, -// vaddps(ymm8, ymm1, ymm1) // add the gemm result, - vfmaddps(ymm8, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, - vmovaps(ymm1, mem(rcx)) // and store back to memory. - - jmp(.SDONE) // jump to end. - - + + vmovaps(mem(rcx), ymm0) // load c00:c70, + //vmulps(ymm4, ymm0, ymm0) // scale by beta, + //vaddps(ymm15, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c01:c71, + //vmulps(ymm4, ymm1, ymm1) // scale by beta, + //vaddps(ymm14, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm14, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm0) // load c02:c72, + //vmulps(ymm4, ymm0, ymm0) // scale by beta, + //vaddps(ymm13, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c03:c73, + //vmulps(ymm4, ymm1, ymm1) // scale by beta, + //vaddps(ymm12, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm12, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm0) // load c04:c74, + //vmulps(ymm4, ymm0, ymm0) // scale by beta, + //vaddps(ymm11, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c05:c75, + //vmulps(ymm4, ymm1, ymm1) // scale by beta, + //vaddps(ymm10, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm10, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm0) // load c06:c76, + //vmulps(ymm4, ymm0, ymm0) // scale by beta, + //vaddps(ymm9, ymm0, ymm0) // add the gemm result, + vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result, + vmovaps(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(mem(rcx), ymm1) // load c07:c77, + //vmulps(ymm4, ymm1, ymm1) // scale by beta, + //vaddps(ymm8, ymm1, ymm1) // add the gemm result, + vfmaddps(ymm8, ymm1, ymm4, ymm1) // scale by beta and add the gemm result, + vmovaps(ymm1, mem(rcx)) // and store back to memory. + + jmp(.SDONE) // jump to end. + label(.SBETAZERO) - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. 
- jne(.SCOLSTORBZ) // jump to column storage case - - - label(.SGENSTORBZ) - // update c00:c70 - vmovapd(ymm15, ymm0) - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c01:c71 - vmovapd(ymm14, ymm0) - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c02:c72 - vmovapd(ymm13, ymm0) - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c03:c73 - vmovapd(ymm12, ymm0) - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c04:c74 - vmovapd(ymm11, ymm0) - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c05:c75 - vmovapd(ymm10, ymm0) - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c06:c76 - vmovapd(ymm9, ymm0) - STORE_SS - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - // update c07:c77 - vmovapd(ymm8, ymm0) - STORE_SS - - jmp(.SDONE) // jump to end. - - - label(.SCOLSTORBZ) - - vmovaps(ymm15, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm14, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm13, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm12, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm11, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm10, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm9, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm8, mem(rcx)) // and store back to memory. - + + vmovaps(ymm15, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm14, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm13, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) // c += cs_c; + + vmovaps(ymm12, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm11, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm8, mem(rcx)) // and store back to memory. + label(.SDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -762,6 +480,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( s ); } #undef KERNEL4x6_1 @@ -862,7 +582,9 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 void bli_dgemm_bulldozer_asm_4x6_fma4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -874,66 +596,68 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_iter = k0 / 12; - uint64_t k_left = k0 % 12; + uint64_t k_iter = k / 12; + uint64_t k_left = k % 12; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_ANY( d, 4, 6, false ); + begin_asm() - - + + vzeroall() mov(var(b), rbx) // load address of b. mov(var(a), rax) // load address of a. prefetch(0, mem(rax, 64)) - - + + vmovaps(mem(rbx, 0*8), xmm1) vmovaps(mem(rbx, 2*8), xmm2) vmovaps(mem(rbx, 4*8), xmm3) add(imm(12*8), rbx) add(imm(8*8), rax) - + mov(var(k_iter), rsi) // i = k_iter; notice var(k_iter) not $0 test(rsi, rsi) je(.CONSIDERKLEFT) - + ALIGN32 label(.LOOPKITER) // MAIN LOOP - - KERNEL4x6_1(xx) - KERNEL4x6_2(xx) - KERNEL4x6_3(xx) - KERNEL4x6_4(xx) - KERNEL4x6_1(xx) - KERNEL4x6_2(xx) - KERNEL4x6_3(xx) - KERNEL4x6_4(xx) - KERNEL4x6_1(xx) - KERNEL4x6_2(xx) - KERNEL4x6_3(xx) - KERNEL4x6_4(xx) - + + KERNEL4x6_1(xx) + KERNEL4x6_2(xx) + KERNEL4x6_3(xx) + KERNEL4x6_4(xx) + KERNEL4x6_1(xx) + KERNEL4x6_2(xx) + KERNEL4x6_3(xx) + KERNEL4x6_4(xx) + KERNEL4x6_1(xx) + KERNEL4x6_2(xx) + KERNEL4x6_3(xx) + KERNEL4x6_4(xx) + dec(rsi) jne(.LOOPKITER) - + label(.CONSIDERKLEFT) - + mov(var(k_left), rsi) - test(rsi, rsi) + test(rsi, rsi) label(.LOOPKLEFT) je(.POSTACCUM) - - KERNEL4x6_1(xx) + + KERNEL4x6_1(xx) add(imm(6*8), rbx) add(imm(4*8), rax) - + dec(rsi) jmp(.LOOPKLEFT) // iterate again if i != 0. 
- + label(.POSTACCUM) - - + + mov(var(rs_c), rsi) // load cs_c mov(var(cs_c), rdi) // load rs_c vmovddup(mem(var(alpha)), xmm2) //load alpha @@ -942,32 +666,32 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 sal(imm(3), rsi) // cs_c *= sizeof(double) sal(imm(3), rdi) // rs_c *= sizeof(double) lea(mem(rcx, rdi, 2), rdx) - - vmovlpd(mem(rcx), xmm0, xmm0) - vmovlpd(mem(rdx), xmm1, xmm1) + + vmovlpd(mem(rcx), xmm0, xmm0) + vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) lea(mem(rdx, rdi, 2), r8) vmulpd(xmm2, xmm4, xmm4) // scale by alpha, vmulpd(xmm2, xmm5, xmm5) // scale by alpha, vfmaddpd(xmm4, xmm0, xmm3, xmm4) // scale by beta, and add the gemm result - vmovlpd(mem(r8), xmm0, xmm0) + vmovlpd(mem(r8), xmm0, xmm0) vfmaddpd(xmm5, xmm1, xmm3, xmm5) // scale by beta, and add the gemm result vmovhpd(mem(r8, rdi, 1), xmm0, xmm0) vmovlpd(xmm4, mem(rcx)) // and store back to memory. vmovlpd(xmm5, mem(rdx)) // and store back to memory. vmovhpd(xmm4, mem(rcx, rdi, 1)) - add(rsi, rcx) + add(rsi, rcx) vmovhpd(xmm5, mem(rdx, rdi, 1)) - add(rsi, rdx) - + add(rsi, rdx) + vmulpd(xmm2, xmm6, xmm6) // scale by alpha, vfmaddpd(xmm6, xmm0, xmm3, xmm6) // scale by beta, and add the gemm result vmovlpd(xmm6, mem(r8)) // and store back to memory. vmovhpd(xmm6, mem(r8, rdi, 1)) - add(rsi, r8) - - + add(rsi, r8) + + vmovlpd(mem(rcx), xmm0, xmm0) vmovlpd(mem(rdx), xmm1, xmm1) vmovlpd(mem(r8), xmm4, xmm4) @@ -984,13 +708,13 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 vmovlpd(xmm8, mem(rdx)) // and store back to memory. vmovlpd(xmm9, mem(r8)) // and store back to memory. 
vmovhpd(xmm7, mem(rcx, rdi, 1)) - add(rsi, rcx) + add(rsi, rcx) vmovhpd(xmm8, mem(rdx, rdi, 1)) - add(rsi, rdx) + add(rsi, rdx) vmovhpd(xmm9, mem(r8, rdi, 1)) - add(rsi, r8) - - + add(rsi, r8) + + vmovlpd(mem(rcx), xmm0, xmm0) vmovlpd(mem(rdx), xmm1, xmm1) vmovlpd(mem(r8), xmm4, xmm4) @@ -1007,13 +731,13 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 vmovlpd(xmm11, mem(rdx)) // and store back to memory. vmovlpd(xmm12, mem(r8)) // and store back to memory. vmovhpd(xmm10, mem(rcx, rdi, 1)) - add(rsi, rcx) + add(rsi, rcx) vmovhpd(xmm11, mem(rdx, rdi, 1)) - add(rsi, rdx) + add(rsi, rdx) vmovhpd(xmm12, mem(r8, rdi, 1)) - add(rsi, r8) - - + add(rsi, r8) + + vmovlpd(mem(rcx), xmm0, xmm0) vmovlpd(mem(rdx), xmm1, xmm1) vmovlpd(mem(r8), xmm4, xmm4) @@ -1031,30 +755,32 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 vmovlpd(xmm15, mem(r8)) // and store back to memory. vmovhpd(xmm13, mem(rcx, rdi, 1)) vmovhpd(xmm14, mem(rdx, rdi, 1)) - vmovhpd(xmm15, mem(r8, rdi, 1)) - - end_asm( - : // output operands (none) - : // input operands - [k_iter] "r" (k_iter), // 0 - [k_left] "r" (k_left), // 1 - [a] "r" (a), // 2 - [b] "r" (b), // 3 - [alpha] "r" (alpha), // 4 - [beta] "r" (beta), // 5 - [c] "r" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" + vmovhpd(xmm15, mem(r8, rdi, 1)) + + end_asm( + : // output operands (none) + : // input operands + [k_iter] "r" (k_iter), // 0 + [k_left] "r" (k_left), // 1 + [a] "r" (a), // 2 + [b] "r" (b), // 3 + [alpha] "r" (alpha), // 4 + [beta] "r" (beta), // 5 + [c] "r" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", 
"rdi", "r8", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } //The parameter "i" is the iteration number, i.e. the B values to read #define MADD_TO_YMM(i) \ @@ -1076,7 +802,9 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 void bli_cgemm_bulldozer_asm_8x4_fma4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, @@ -1091,33 +819,35 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_ALIGNED( c, 8, 4, false, 32 ); + begin_asm() - + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. //mov(var(a_next), r14) // load address of a_next. sub(imm(4*64), r15) - + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) vpermilps(imm(0x4e), ymm2, ymm3) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; - + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c - + vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) @@ -1126,343 +856,312 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) - + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - + label(.CLOOPKITER) // MAIN LOOP - + add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr) - + // iteration 0 prefetch(0, mem(rax, 8*32)) vmovaps(mem(rax, 1*32), ymm1) MADD_TO_YMM(0) - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vaddsubps(ymm6, ymm15, ymm15) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 1*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 2*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + // iteration 1 prefetch(0, mem(rax, 10*32)) vmovaps(mem(rax, 3*32), ymm1) MADD_TO_YMM(1) - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 2*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 4*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - + // iteration 2 prefetch(0, mem(rax, 12*32)) vmovaps(mem(rax, 5*32), ymm1) MADD_TO_YMM(2) prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4] - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 3*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + 
vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 6*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + // iteration 3 prefetch(0, mem(rax, 14*32)) vmovaps(mem(rax, 7*32), ymm1) MADD_TO_YMM(3) - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 4*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 8*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + add(imm(8*4*8), rax) // a += 8*4 (unroll x mr) add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) - - + + dec(rsi) // i -= 1; jne(.CLOOPKITER) // iterate again if i != 0. - - - + + + label(.CCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.CLOOPKLEFT) // EDGE LOOP - + // iteration 0 prefetch(0, mem(rax, 8*32)) vmovaps(mem(rax, 1*32), ymm1) MADD_TO_YMM(0) - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 1*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 2*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + add(imm(8*1*8), rax) // a += 8 (1 x mr) add(imm(4*1*8), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.CLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.CPOSTACCUM) - + // ymm15: ymm13: ymm11: ymm9: - // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 - // ab21 ab20 ab23 ab22 - // ab31 ab30 ab33 ab32 - // ab42 ab43 ab40 ab41 - // ab52 ab53 ab50 ab51 - // ab63 ab62 ab61 ab60 + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab21 ab20 ab23 ab22 + // ab31 ab30 ab33 ab32 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab63 ab62 ab61 ab60 // ab73 ) ab72 ) ab71 ) ab70 ) - + // ymm14: ymm12: ymm10: ymm8: - // ( ab80 ( ab81 ( ab82 ( ab83 - // ab90 ab91 ab92 ab93 - // aba1 aba0 aba3 aba2 - // abb1 abb0 abb3 abb2 - // abc2 abc3 abc0 abc1 - // abd2 abd3 abd0 abd1 - // abe3 abe2 abe1 abe0 + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba1 aba0 aba3 aba2 + // abb1 abb0 abb3 abb2 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe3 abe2 abe1 abe0 // abf3 abf2 abf1 abf0 ) GROUP_YMM_BY_4 // ymm15: ymm13: ymm11: ymm9: - // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 - // ab20 ab21 ab22 ab23 - // ab30 ab31 ab32 ab33 - // ab42 ab43 ab40 ab41 - // ab52 ab53 ab50 ab51 - // ab62 ab63 ab60 ab61 + // ( ab00 
( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab62 ab63 ab60 ab61 // ab72 ) ab73 ) ab70 ) ab71 ) - + // ymm14: ymm12: ymm10: ymm8: - // ( ab80 ( ab81 ( ab82 ( ab83 - // ab90 ab91 ab92 ab93 - // aba0 aba1 aba2 aba3 - // abb0 abb1 abb2 abb3 - // abc2 abc3 abc0 abc1 - // abd2 abd3 abd0 abd1 - // abe2 abe3 abe0 abe1 + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe2 abe3 abe0 abe1 // abf2 ) abf3 ) abf0 ) abf1 ) - + // ymm15: ymm13: ymm11: ymm9: - // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 - // ab20 ab21 ab22 ab23 - // ab30 ab31 ab32 ab33 - // ab40 ab41 ab42 ab43 - // ab50 ab51 ab52 ab53 - // ab60 ab61 ab62 ab63 + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab40 ab41 ab42 ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) - + // ymm14: ymm12: ymm10: ymm8: - // ( ab80 ( ab81 ( ab82 ( ab83 - // ab90 ab91 ab92 ab93 - // aba0 aba1 aba2 aba3 - // abb0 abb1 abb2 abb3 - // abc0 abc1 abc2 abc3 - // abd0 abd1 abd2 abd3 - // abe0 abe1 abe2 abe3 + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc0 abc1 abc2 abc3 + // abd0 abd1 abd2 abd3 + // abe0 abe1 abe2 abe3 // abf0 ) abf1 ) abf2 ) abf3 ) - + // scale by alpha - + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate - + vpermilps(imm(0xb1), ymm15, ymm3) vmulps(ymm7, ymm15, ymm15) vmulps(ymm6, ymm3, ymm3) vaddsubps(ymm3, ymm15, ymm15) - + vpermilps(imm(0xb1), ymm14, ymm2) vmulps(ymm7, ymm14, ymm14) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm14, ymm14) - + vpermilps(imm(0xb1), ymm13, ymm1) vmulps(ymm7, ymm13, ymm13) 
vmulps(ymm6, ymm1, ymm1) vaddsubps(ymm1, ymm13, ymm13) - + vpermilps(imm(0xb1), ymm12, ymm0) vmulps(ymm7, ymm12, ymm12) vmulps(ymm6, ymm0, ymm0) vaddsubps(ymm0, ymm12, ymm12) - + vpermilps(imm(0xb1), ymm11, ymm3) vmulps(ymm7, ymm11, ymm11) vmulps(ymm6, ymm3, ymm3) vaddsubps(ymm3, ymm11, ymm11) - + vpermilps(imm(0xb1), ymm10, ymm2) vmulps(ymm7, ymm10, ymm10) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm10, ymm10) - + vpermilps(imm(0xb1), ymm9, ymm1) vmulps(ymm7, ymm9, ymm9) vmulps(ymm6, ymm1, ymm1) vaddsubps(ymm1, ymm9, ymm9) - + vpermilps(imm(0xb1), ymm8, ymm0) vmulps(ymm7, ymm8, ymm8) vmulps(ymm6, ymm0, ymm0) vaddsubps(ymm0, ymm8, ymm8) - - - - + + + + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate - - - - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) - - lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - - lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; - lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; - - - - // determine if - // c % 32 == 0, AND - // 8*cs_c % 32 == 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(31), rcx) // set ZF if c & 32 is zero. - setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); - test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero. - setz(al) // al = ( ZF == 0 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm7) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -1470,388 +1169,126 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. 
jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.CCOLSTORED) // jump to column storage case - - - - label(.CGENSTORED) - - // update c00:c70 - - vmovlpd(mem(rcx), xmm0, xmm0) // load (c00,10) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c20,30) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c40,50) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c60,70) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c00,c10) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c20,c30) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70) - add(rdi, rcx) // c += cs_c; - - // update c80:cf0 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c80,90) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca0,b0) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc0,d0) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce0,f0) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c80,c90) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca0,cb0) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0) - add(rdi, rdx) // c += cs_c; - - // update c01:c71 - 
- vmovlpd(mem(rcx), xmm0, xmm0) // load (c01,11) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c21,31) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c41,51) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c61,71) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c01,c11) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c21,c31) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71) - add(rdi, rcx) // c += cs_c; - - // update c81:cf1 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c81,91) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca1,b1) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc1,d1) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce1,f1) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c81,c91) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca1,cb1) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1) - add(rdi, rdx) // c += cs_c; - - // update c02:c72 - - vmovlpd(mem(rcx), xmm0, xmm0) // load (c02,12) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c22,32) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c42,52) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c62,72) into 
xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c02,c12) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c22,c32) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72) - add(rdi, rcx) // c += cs_c; - - // update c82:cf2 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c82,92) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca2,b2) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc2,d2) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce2,f2) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c82,c92) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca2,cb2) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2) - add(rdi, rdx) // c += cs_c; - - // update c03:c73 - - vmovlpd(mem(rcx), xmm0, xmm0) // load (c03,13) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c23,33) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c43,53) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c63,73) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm9, ymm0, ymm0) // add the gemm result to 
ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c03,c13) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c23,c33) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73) - add(rdi, rcx) // c += cs_c; - - // update c83:cf3 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c83,93) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca3,b3) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc3,d3) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce3,f3) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c83,c93) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca3,cb3) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3) - add(rdi, rdx) // c += cs_c; - - - - jmp(.CDONE) // jump to end. 
- - - - label(.CCOLSTORED) - - // update c00:c70 - - vmovaps(mem(rcx), ymm0) // load c00:c70 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rcx)) // store c00:c70 - add(rdi, rcx) // c += cs_c; - - // update c80:cf0 - - vmovaps(mem(rdx), ymm0) // load c80:f0 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rdx)) // store c80:cf0 - add(rdi, rdx) // c += cs_c; - - // update c00:c70 - - vmovaps(mem(rcx), ymm0) // load c01:c71 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rcx)) // store c01:c71 - add(rdi, rcx) // c += cs_c; - - // update c81:cf1 - - vmovaps(mem(rdx), ymm0) // load c81:f1 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rdx)) // store c81:cf1 - add(rdi, rdx) // c += cs_c; - - // update c02:c72 - vmovaps(mem(rcx), ymm0) // load c02:c72 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rcx)) // store c02:c72 - add(rdi, rcx) // c += cs_c; - - // update c82:cf2 - vmovaps(mem(rdx), ymm0) // load c82:f2 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm10, 
ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rdx)) // store c82:cf2 - add(rdi, rdx) // c += cs_c; - - // update c03:c73 - vmovaps(mem(rcx), ymm0) // load c03:c73 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rcx)) // store c03:c73 - add(rdi, rcx) // c += cs_c; - - // update c83:cf3 - vmovaps(mem(rdx), ymm0) // load c83:f3 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vmovaps(ymm0, mem(rdx)) // store c83:cf3 - add(rdi, rdx) // c += cs_c; - - jmp(.CDONE) // jump to end. - - + + // update c00:c70 + + vmovaps(mem(rcx), ymm0) // load c00:c70 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx)) // store c00:c70 + + // update c80:cf0 + + vmovaps(mem(rcx,32), ymm0) // load c80:f0 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx,32)) // store c80:cf0 + add(rdi, rcx) // c += cs_c; + + // update c00:c70 + + vmovaps(mem(rcx), ymm0) // load c01:c71 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx)) // store c01:c71 + + // update c81:cf1 + + vmovaps(mem(rcx,32), ymm0) // load c81:f1 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + 
vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx,32)) // store c81:cf1 + add(rdi, rcx) // c += cs_c; + + // update c02:c72 + vmovaps(mem(rcx), ymm0) // load c02:c72 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx)) // store c02:c72 + + // update c82:cf2 + vmovaps(mem(rcx,32), ymm0) // load c82:f2 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx,32)) // store c82:cf2 + add(rdi, rcx) // c += cs_c; + + // update c03:c73 + vmovaps(mem(rcx), ymm0) // load c03:c73 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx)) // store c03:c73 + + // update c83:cf3 + vmovaps(mem(rcx,32), ymm0) // load c83:f3 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vmovaps(ymm0, mem(rcx,32)) // store c83:cf3 + //add(rdi, rcx) // c += cs_c; + + jmp(.CDONE) // jump to end. + label(.CBETAZERO) - // check if aligned/column-stored - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. 
- jne(.CCOLSTORBZ) // jump to column storage case - - - label(.CGENSTORBZ) - // update c00:c70 - vextractf128(imm(1), ymm15, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm15, mem(rcx)) // store (c00,c10) - vmovhpd(xmm15, mem(rcx, rsi, 1)) // store (c20,c30) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70) - add(rdi, rcx) // c += cs_c; - - // update c80:cf0 - vextractf128(imm(1), ymm14, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm14, mem(rdx)) // store (c80,c90) - vmovhpd(xmm14, mem(rdx, rsi, 1)) // store (ca0,cb0) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0) - add(rdi, rdx) // c += cs_c; - - // update c01:c71 - vextractf128(imm(1), ymm13, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm13, mem(rcx)) // store (c01,c11) - vmovhpd(xmm13, mem(rcx, rsi, 1)) // store (c21,c31) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71) - add(rdi, rcx) // c += cs_c; - - // update c81:cf1 - vextractf128(imm(1), ymm12, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm12, mem(rdx)) // store (c81,c91) - vmovhpd(xmm12, mem(rdx, rsi, 1)) // store (ca1,cb1) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1) - add(rdi, rdx) // c += cs_c; - - // update c02:c72 - vextractf128(imm(1), ymm11, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm11, mem(rcx)) // store (c02,c12) - vmovhpd(xmm11, mem(rcx, rsi, 1)) // store (c22,c32) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72) - add(rdi, rcx) // c += cs_c; - - // update c82:cf2 - vextractf128(imm(1), ymm10, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm10, mem(rdx)) // store (c82,c92) - vmovhpd(xmm10, mem(rdx, rsi, 1)) // store (ca2,cb2) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2) - add(rdi, rdx) // c += cs_c; - - // update c03:c73 - 
vextractf128(imm(1), ymm9, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm9, mem(rcx)) // store (c03,c13) - vmovhpd(xmm9, mem(rcx, rsi, 1)) // store (c23,c33) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73) - add(rdi, rcx) // c += cs_c; - - // update c83:cf3 - vextractf128(imm(1), ymm8, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm8, mem(rdx)) // store (c83,c93) - vmovhpd(xmm8, mem(rdx, rsi, 1)) // store (ca3,cb3) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3) - add(rdi, rdx) // c += cs_c; - - - jmp(.CDONE) // jump to end. - - - label(.CCOLSTORBZ) - - vmovaps(ymm15, mem(rcx)) // store c00:c70 - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm14, mem(rdx)) // store c80:cf0 - add(rdi, rdx) // c += cs_c; - - vmovaps(ymm13, mem(rcx)) // store c01:c71 - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm12, mem(rdx)) // store c81:cf1 - add(rdi, rdx) // c += cs_c; - - vmovaps(ymm11, mem(rcx)) // store c02:c72 - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm10, mem(rdx)) // store c82:cf2 - add(rdi, rdx) // c += cs_c; - - vmovaps(ymm9, mem(rcx)) // store c03:c73 - add(rdi, rcx) // c += cs_c; - - vmovaps(ymm8, mem(rdx)) // store c83:cf3 - add(rdi, rdx) // c += cs_c; - - - + + vmovaps(ymm15, mem(rcx)) // store c00:c70 + vmovaps(ymm14, mem(rcx,32)) // store c80:cf0 + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm13, mem(rcx)) // store c01:c71 + vmovaps(ymm12, mem(rcx,32)) // store c81:cf1 + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm11, mem(rcx)) // store c02:c72 + vmovaps(ymm10, mem(rcx,32)) // store c82:cf2 + add(rdi, rcx) // c += cs_c; + + vmovaps(ymm9, mem(rcx)) // store c03:c73 + vmovaps(ymm8, mem(rcx,32)) // store c83:cf3 + add(rdi, rcx) // c += cs_c; + label(.CDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), 
// 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next)/*, // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", @@ -1859,6 +1296,8 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) + + GEMM_UKR_FLUSH_CT( c ); } #define MADDSUBPD_TO_YMM \ @@ -1883,11 +1322,13 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 vmulpd(ymm7, ymm(i), ymm(i))\ vmulpd(ymm6, ymm(j), ymm(j))\ vaddsubpd(ymm(j), ymm(i), ymm(i))\ - + void bli_zgemm_bulldozer_asm_4x4_fma4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -1902,34 +1343,36 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_ALIGNED( z, 4, 4, false, 32 ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(var(b_next), r15) // load address of b_next. //mov(var(a_next), r14) // load address of a_next. 
- + vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovddup(mem(rbx, 0+0*32), ymm2) vmovddup(mem(rbx, 0+1*32), ymm3) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; - + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c - + vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) @@ -1938,28 +1381,28 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - + label(.ZLOOPKITER) // MAIN LOOP - + // iteration 0 vmovapd(mem(rax, 1*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) - + prefetch(0, mem(rax, 16*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+0*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+1*32), ymm3) - + MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+2*32), ymm2) @@ -1967,31 +1410,31 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 vmovddup(mem(rbx, 0+3*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 2*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - + // iteration 1 vmovapd(mem(rax, 3*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, 
ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) - + prefetch(0, mem(rax, 18*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+2*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+3*32), ymm3) - + MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+4*32), ymm2) @@ -1999,31 +1442,31 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 vmovddup(mem(rbx, 0+5*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 4*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - + // iteration 2 vmovapd(mem(rax, 5*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) - + prefetch(0, mem(rax, 20*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+4*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+5*32), ymm3) - + MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+6*32), ymm2) @@ -2031,31 +1474,31 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 vmovddup(mem(rbx, 0+7*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 6*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - + // iteration 3 vmovapd(mem(rax, 7*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) - + prefetch(0, mem(rax, 22*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+6*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+7*32), ymm3) - + MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+8*32), 
ymm2) @@ -2063,48 +1506,48 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 vmovddup(mem(rbx, 0+9*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 8*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - + add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) add(imm(4*4*16), rax) // a += 4*4 (unroll x mr) - + dec(rsi) // i -= 1; jne(.ZLOOPKITER) // iterate again if i != 0. - - + + label(.ZCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.ZLOOPKLEFT) // EDGE LOOP - + // iteration 0 vmovapd(mem(rax, 1*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) - + prefetch(0, mem(rax, 16*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+0*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+1*32), ymm3) - + MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+2*32), ymm2) @@ -2112,75 +1555,75 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 vmovddup(mem(rbx, 0+3*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 2*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - - + + add(imm(4*1*16), rax) // a += 4 (1 x mr) add(imm(4*1*16), rbx) // b += 4 (1 x nr) - + dec(rsi) // i -= 1; jne(.ZLOOPKLEFT) // iterate again if i != 0. 
- - + + label(.ZPOSTACCUM) // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 + // ab10 ab11 ab12 ab13 // ab21 ab20 ab23 ab22 // ab31 ) ab30 ) ab33 ) ab32 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab40 ( ab41 ( ab42 ( ab43 - // ab50 ab51 ab52 ab53 + // ab50 ab51 ab52 ab53 // ab61 ab60 ab63 ab62 // ab71 ) ab70 ) ab73 ) ab72 ) - + vmovapd(ymm15, ymm7) vperm2f128(imm(0x12), ymm15, ymm13, ymm15) vperm2f128(imm(0x30), ymm7, ymm13, ymm13) - + vmovapd(ymm11, ymm7) vperm2f128(imm(0x12), ymm11, ymm9, ymm11) vperm2f128(imm(0x30), ymm7, ymm9, ymm9) - + vmovapd(ymm14, ymm7) vperm2f128(imm(0x12), ymm14, ymm12, ymm14) vperm2f128(imm(0x30), ymm7, ymm12, ymm12) - + vmovapd(ymm10, ymm7) vperm2f128(imm(0x12), ymm10, ymm8, ymm10) vperm2f128(imm(0x30), ymm7, ymm8, ymm8) - - + + // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 + // ab10 ab11 ab12 ab13 // ab20 ab21 ab22 ab23 // ab30 ) ab31 ) ab32 ) ab33 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab40 ( ab41 ( ab42 ( ab43 - // ab50 ab51 ab52 ab53 + // ab50 ab51 ab52 ab53 // ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) - - + + // scale by alpha - + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate - + Z_ALPHA(15, 3) Z_ALPHA(14, 2) Z_ALPHA(13, 1) @@ -2190,38 +1633,14 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 Z_ALPHA(10, 2) Z_ALPHA(9, 1) Z_ALPHA(8, 0) - + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) - lea(mem(, rsi, 2), rsi) - lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; - - - - // determine if - // c % 32 == 0, AND - // 16*cs_c % 32 == 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - 
cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(31), rcx) // set ZF if c & 32 is zero. - setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); - test(imm(31), rdi) // set ZF if (16*cs_c) & 32 is zero. - setz(al) // al = ( ZF == 0 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm7) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -2229,287 +1648,91 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.ZCOLSTORED) // jump to column storage case - - - - label(.ZGENSTORED) - // update c00:c30 - - vmovupd(mem(rcx), xmm0) // load (c00,c10) into xmm0 - vmovupd(mem(rcx, rsi, 1), xmm2) // load (c20,c30) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c00,c10) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) - add(rdi, rcx) // c += cs_c; - - // update c40:c70 - - vmovupd(mem(rdx), xmm0) // load (c40,c50) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c60,c70) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c40,c50) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) - add(rdi, rdx) // c += cs_c; - - // update c01:c31 - - vmovupd(mem(rcx), xmm0) // load (c01,c11) into xmm0 - vmovupd(mem(rcx, rsi, 1), 
xmm2) // load (c21,c31) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c01,c11) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) - add(rdi, rcx) // c += cs_c; - - // update c41:c71 - - vmovupd(mem(rdx), xmm0) // load (c41,c51) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c61,c71) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c41,c51) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) - add(rdi, rdx) // c += cs_c; - - // update c02:c32 - - vmovupd(mem(rcx), xmm0) // load (c02,c12) into xmm0 - vmovupd(mem(rcx, rsi, 1), xmm2) // load (c22,c32) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c02,c12) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) - add(rdi, rcx) // c += cs_c; - - // update c42:c72 - - vmovupd(mem(rdx), xmm0) // load (c42,c52) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c62,c72) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c42,c52) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) - add(rdi, rdx) // c += cs_c; - - // update c03:c33 - - vmovupd(mem(rcx), xmm0) // load (c03,c13) into xmm0 - vmovupd(mem(rcx, rsi, 1), xmm2) // load (c23,c33) into xmm2 - 
vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c03,c13) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) - add(rdi, rcx) // c += cs_c; - - // update c43:c73 - - vmovupd(mem(rdx), xmm0) // load (c43,c53) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c63,c73) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c43,c53) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) - - - - jmp(.ZDONE) // jump to end. - - - - label(.ZCOLSTORED) - // update c00:c30 - - vmovapd(mem(rcx), ymm0) // load c00:c30 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vmovapd(ymm0, mem(rcx)) // store c00:c30 - add(rdi, rcx) // c += cs_c; - - // update c40:c70 - - vmovapd(mem(rdx), ymm0) // load c40:c70 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vmovapd(ymm0, mem(rdx)) // store c40:c70 - add(rdi, rdx) // c += cs_c; - - // update c01:c31 - - vmovapd(mem(rcx), ymm0) // load c01:c31 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vmovapd(ymm0, mem(rcx)) // store c01:c31 - add(rdi, rcx) // c += cs_c; - - // update c41:c71 - - vmovapd(mem(rdx), ymm0) // load c41:c71 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vmovapd(ymm0, mem(rdx)) // store c41:c71 - add(rdi, rdx) // c += cs_c; - - // update c02:c32 - - vmovapd(mem(rcx), ymm0) // load c02:c32 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm11, ymm0, ymm0) // add 
the gemm result to ymm0 - vmovapd(ymm0, mem(rcx)) // store c02:c32 - add(rdi, rcx) // c += cs_c; - - // update c42:c72 - - vmovapd(mem(rdx), ymm0) // load c42:c72 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 - vmovapd(ymm0, mem(rdx)) // store c42:c72 - add(rdi, rdx) // c += cs_c; - - // update c03:c33 - - vmovapd(mem(rcx), ymm0) // load c03:c33 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 - vmovapd(ymm0, mem(rcx)) // store c03:c33 - add(rdi, rcx) // c += cs_c; - - // update c43:c73 - - vmovapd(mem(rdx), ymm0) // load c43:c73 into ymm0 - Z_ALPHA(0, 2) // scale ymm0 by beta - vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vmovapd(ymm0, mem(rdx)) // store c43:c73 - - - - jmp(.ZDONE) // jump to end. - - - + + // update c00:c30 + + vmovapd(mem(rcx), ymm0) // load c00:c30 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c00:c30 + + // update c40:c70 + + vmovapd(mem(rcx,32), ymm0) // load c40:c70 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx,32)) // store c40:c70 + add(rdi, rcx) // c += cs_c; + + // update c01:c31 + + vmovapd(mem(rcx), ymm0) // load c01:c31 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c01:c31 + + // update c41:c71 + + vmovapd(mem(rcx,32), ymm0) // load c41:c71 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx,32)) // store c41:c71 + add(rdi, rcx) // c += cs_c; + + // update c02:c32 + + vmovapd(mem(rcx), ymm0) // load c02:c32 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c02:c32 + 
+ // update c42:c72 + + vmovapd(mem(rcx,32), ymm0) // load c42:c72 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx,32)) // store c42:c72 + add(rdi, rcx) // c += cs_c; + + // update c03:c33 + + vmovapd(mem(rcx), ymm0) // load c03:c33 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx)) // store c03:c33 + + // update c43:c73 + + vmovapd(mem(rcx,32), ymm0) // load c43:c73 into ymm0 + Z_ALPHA(0, 2) // scale ymm0 by beta + vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vmovapd(ymm0, mem(rcx,32)) // store c43:c73 + add(rdi, rcx) // c += cs_c; + + jmp(.ZDONE) // jump to end. + label(.ZBETAZERO) - // check if aligned/column-stored - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.ZCOLSTORBZ) // jump to column storage case - - - - label(.ZGENSTORBZ) - // update c00:c30 - - vextractf128(imm(1), ymm15, xmm2) - vmovupd(xmm15, mem(rcx)) // store (c00,c10) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) - add(rdi, rcx) // c += cs_c; - - // update c40:c70 - - vextractf128(imm(1), ymm14, xmm2) - vmovupd(xmm14, mem(rdx)) // store (c40,c50) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) - add(rdi, rdx) // c += cs_c; - - // update c01:c31 - - vextractf128(imm(1), ymm13, xmm2) - vmovupd(xmm13, mem(rcx)) // store (c01,c11) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) - add(rdi, rcx) // c += cs_c; - - // update c41:c71 - - vextractf128(imm(1), ymm12, xmm2) - vmovupd(xmm12, mem(rdx)) // store (c41,c51) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) - add(rdi, rdx) // c += cs_c; - - // update c02:c32 - - vextractf128(imm(1), ymm11, xmm2) - vmovupd(xmm11, mem(rcx)) // store (c02,c12) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) - add(rdi, rcx) // c += cs_c; - - // update c42:c72 - - vextractf128(imm(1), ymm10, 
xmm2) - vmovupd(xmm10, mem(rdx)) // store (c42,c52) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) - add(rdi, rdx) // c += cs_c; - - // update c03:c33 - - vextractf128(imm(1), ymm9, xmm2) - vmovupd(xmm9, mem(rcx)) // store (c03,c13) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) - add(rdi, rcx) // c += cs_c; - - // update c43:c73 - - vextractf128(imm(1), ymm8, xmm2) - vmovupd(xmm8, mem(rdx)) // store (c43,c53) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) - - - jmp(.ZDONE) // jump to end. - - - label(.ZCOLSTORBZ) - - - vmovapd(ymm15, mem(rcx)) // store c00:c30 - add(rdi, rcx) // c += cs_c; - - vmovapd(ymm14, mem(rdx)) // store c40:c70 - add(rdi, rdx) // c += cs_c; - - vmovapd(ymm13, mem(rcx)) // store c01:c31 - add(rdi, rcx) // c += cs_c; - - vmovapd(ymm12, mem(rdx)) // store c41:c71 - add(rdi, rdx) // c += cs_c; - - vmovapd(ymm11, mem(rcx)) // store c02:c32 - add(rdi, rcx) // c += cs_c; - - vmovapd(ymm10, mem(rdx)) // store c42:c72 - add(rdi, rdx) // c += cs_c; - - vmovapd(ymm9, mem(rcx)) // store c03:c33 - add(rdi, rcx) // c += cs_c; - - vmovapd(ymm8, mem(rdx)) // store c43:c73 - - + + vmovapd(ymm15, mem(rcx)) // store c00:c30 + vmovapd(ymm14, mem(rcx,32)) // store c40:c70 + add(rdi, rcx) // c += cs_c; + + vmovapd(ymm13, mem(rcx)) // store c01:c31 + vmovapd(ymm12, mem(rcx,32)) // store c41:c71 + add(rdi, rcx) // c += cs_c; + + vmovapd(ymm11, mem(rcx)) // store c02:c32 + vmovapd(ymm10, mem(rcx,32)) // store c42:c72 + add(rdi, rcx) // c += cs_c; + + vmovapd(ymm9, mem(rcx)) // store c03:c33 + vmovapd(ymm8, mem(rcx,32)) // store c43:c73 + //add(rdi, rcx) // c += cs_c; + label(.ZDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 @@ -2524,7 +1747,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", 
"r13", "r14", "r15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", @@ -2532,5 +1755,7 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 7907bd901..d0e793867 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -79,7 +79,9 @@ void bli_sgemm_haswell_asm_6x16 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -94,11 +96,13 @@ void bli_sgemm_haswell_asm_6x16 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_AMBI( s, 6, 16, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. @@ -109,36 +113,65 @@ void bli_sgemm_haswell_asm_6x16 //mov(%9, r15) // load address of b_next. 
add(imm(32*4), rbx) - // initialize loop by pre-loading + // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) - lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; - lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c - - - + cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. + jz(.SCOLPREFETCH) // jump to column prefetch case + + lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + + jmp(.SPREFETCHDONE) + + label(.SCOLPREFETCH) + + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 7*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c + lea(mem(rcx, rsi, 8), r14) // r14 = c + 
8*cs_c; + lea(mem(r14, r13, 1), rdx) // rdx = c + 11*cs_c; + prefetch(0, mem(r14, 7*8)) // prefetch c + 8*cs_c + prefetch(0, mem(r14, rsi, 1, 7*8)) // prefetch c + 9*cs_c + prefetch(0, mem(r14, rsi, 2, 7*8)) // prefetch c + 10*cs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 11*cs_c + prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 12*cs_c + prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 13*cs_c + prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 14*cs_c + prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 15*cs_c + + label(.SPREFETCHDONE) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. + // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP - // iteration 0 + // iteration 0 prefetch(0, mem(rax, 64*4)) vbroadcastss(mem(rax, 0*4), ymm2) @@ -165,7 +198,7 @@ void bli_sgemm_haswell_asm_6x16 vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) - // iteration 1 + // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) @@ -190,7 +223,7 @@ void bli_sgemm_haswell_asm_6x16 vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) - // iteration 2 + // iteration 2 prefetch(0, mem(rax, 76*4)) vbroadcastss(mem(rax, 12*4), ymm2) @@ -217,7 +250,7 @@ void bli_sgemm_haswell_asm_6x16 vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) - // iteration 3 + // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) @@ -259,7 +292,7 @@ void bli_sgemm_haswell_asm_6x16 mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. + // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP @@ -338,533 +371,330 @@ void bli_sgemm_haswell_asm_6x16 lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - // now avoid loading C if beta == 0 + // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. - jz(.SROWSTORED) // jump to row storage case - - - cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. - jz(.SCOLSTORED) // jump to column storage case - - - - label(.SGENSTORED) - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm4, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm6, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm8, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm10, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm12, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm14, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += rs_c; - - - mov(rdx, rcx) // rcx = c + 8*cs_c - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm5, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm7, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm9, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm11, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm13, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm15, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - 
//add(rdi, rcx) // c += rs_c; - - - - jmp(.SDONE) // jump to end. - - - - label(.SROWSTORED) - - - vfmadd231ps(mem(rcx), ymm3, ymm4) - vmovups(ymm4, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm5) - vmovups(ymm5, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm6) - vmovups(ymm6, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm7) - vmovups(ymm7, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm8) - vmovups(ymm8, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm9) - vmovups(ymm9, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm10) - vmovups(ymm10, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm11) - vmovups(ymm11, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm12) - vmovups(ymm12, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm13) - vmovups(ymm13, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm14) - vmovups(ymm14, mem(rcx)) - //add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm15) - vmovups(ymm15, mem(rdx)) - //add(rdi, rdx) - - - - jmp(.SDONE) // jump to end. 
- - - - label(.SCOLSTORED) - - - vbroadcastss(mem(rbx), ymm3) - - vunpcklps(ymm6, ymm4, ymm0) - vunpcklps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) - vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) - vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - - vunpckhps(ymm6, ymm4, ymm0) - vunpckhps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) - vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2) - vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) - vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) - vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - - lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - vunpcklps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(r14), xmm1, xmm1) - vmovhpd(mem(r14, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) - vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) - vmovlpd(mem(r14, rsi, 4), xmm1, xmm1) - vmovhpd(mem(r14, r15, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) - vmovhpd(xmm2, mem(r14, r15, 1)) // store ( 
gamma45..gamma55 ) - - vunpckhps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) - vmovhpd(mem(r14, r13, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) - vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) - vmovlpd(mem(r14, r13, 2), xmm1, xmm1) - vmovhpd(mem(r14, r10, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) - vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - - lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - - vunpcklps(ymm7, ymm5, ymm0) - vunpcklps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx), xmm3, xmm0) - vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) - vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) - vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - - vunpckhps(ymm7, ymm5, ymm0) - vunpckhps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) - vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2) - vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) - vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) - vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - - 
//lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - vunpcklps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(r14), xmm1, xmm1) - vmovhpd(mem(r14, rsi, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) - vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) - vmovlpd(mem(r14, rsi, 4), xmm1, xmm1) - vmovhpd(mem(r14, r15, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) - vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - - vunpckhps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) - vmovhpd(mem(r14, r13, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm0) - vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) - vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) - vmovlpd(mem(r14, r13, 2), xmm1, xmm1) - vmovhpd(mem(r14, r10, 1), xmm1, xmm1) - vfmadd231ps(xmm1, xmm3, xmm2) - vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) - vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - - //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - - jmp(.SDONE) // jump to end. - - + cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. 
+ jz(.SCOLSTORED) // jump to column storage case + + vfmadd231ps(mem(rcx), ymm3, ymm4) + vmovups(ymm4, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm5) + vmovups(ymm5, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm6) + vmovups(ymm6, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm7) + vmovups(ymm7, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm8) + vmovups(ymm8, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm9) + vmovups(ymm9, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm10) + vmovups(ymm10, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm11) + vmovups(ymm11, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm12) + vmovups(ymm12, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm13) + vmovups(ymm13, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm14) + vmovups(ymm14, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm15) + vmovups(ymm15, mem(rcx,32)) + //add(rdi, rcx) + + jmp(.SDONE) // jump to end. + + label(.SCOLSTORED) + + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) + vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) + vfmadd231ps(mem(rcx, r13, 2), xmm3, 
xmm2) + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) + vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) + + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(r14), xmm1, xmm1) + vmovhpd(mem(r14, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(mem(r14, rsi, 4), xmm1, xmm1) + vmovhpd(mem(r14, r15, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + + vunpckhps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) + vmovhpd(mem(r14, r13, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(mem(r14, r13, 2), xmm1, xmm1) + vmovhpd(mem(r14, r10, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + + + vunpcklps(ymm7, ymm5, ymm0) + vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx), xmm3, xmm0) + vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + + 
vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) + vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) + + + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + + vextractf128(imm(0x1), ymm0, xmm2) + vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) + vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2) + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + + vextractf128(imm(0x1), ymm1, xmm2) + vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) + vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) + + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + + vunpcklps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(r14), xmm1, xmm1) + vmovhpd(mem(r14, rsi, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(mem(r14, rsi, 4), xmm1, xmm1) + vmovhpd(mem(r14, r15, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + + vunpckhps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) + vmovhpd(mem(r14, r13, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm0) + vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(mem(r14, r13, 2), xmm1, xmm1) + vmovhpd(mem(r14, r10, 1), xmm1, xmm1) + vfmadd231ps(xmm1, xmm3, xmm2) + vmovlpd(xmm2, mem(r14, r13, 2)) // store ( 
gamma46..gamma56 ) + vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + + jmp(.SDONE) // jump to end. label(.SBETAZERO) - cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. - jz(.SROWSTORBZ) // jump to row storage case - - cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. - jz(.SCOLSTORBZ) // jump to column storage case - - - - label(.SGENSTORBZ) - - - vmovaps(ymm4, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovaps(ymm6, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovaps(ymm8, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovaps(ymm10, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovaps(ymm12, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovaps(ymm14, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += rs_c; - - - mov(rdx, rcx) // rcx = c + 8*cs_c - - - vmovaps(ymm5, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovaps(ymm7, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - + cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. + jz(.SCOLSTORBZ) // jump to column storage case - vmovaps(ymm9, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; + vmovups(ymm4, mem(rcx)) + vmovups(ymm5, mem(rcx,32)) + add(rdi, rcx) + vmovups(ymm6, mem(rcx)) + vmovups(ymm7, mem(rcx,32)) + add(rdi, rcx) - vmovaps(ymm11, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; + vmovups(ymm8, mem(rcx)) + vmovups(ymm9, mem(rcx,32)) + add(rdi, rcx) - vmovaps(ymm13, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; + vmovups(ymm10, mem(rcx)) + vmovups(ymm11, mem(rcx,32)) + add(rdi, rcx) - vmovaps(ymm15, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += rs_c; + vmovups(ymm12, mem(rcx)) + vmovups(ymm13, mem(rcx,32)) + add(rdi, rcx) - jmp(.SDONE) // jump to end. 
+ vmovups(ymm14, mem(rcx)) + vmovups(ymm15, mem(rcx,32)) + //add(rdi, rcx) + jmp(.SDONE) // jump to end. + label(.SCOLSTORBZ) - label(.SROWSTORBZ) + vunpcklps(ymm6, ymm4, ymm0) + vunpcklps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - vmovups(ymm4, mem(rcx)) - add(rdi, rcx) - vmovups(ymm5, mem(rdx)) - add(rdi, rdx) + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - vmovups(ymm6, mem(rcx)) - add(rdi, rcx) - vmovups(ymm7, mem(rdx)) - add(rdi, rdx) + vunpckhps(ymm6, ymm4, ymm0) + vunpckhps(ymm10, ymm8, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) - vmovups(ymm8, mem(rcx)) - add(rdi, rcx) - vmovups(ymm9, mem(rdx)) - add(rdi, rdx) + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - vmovups(ymm10, mem(rcx)) - add(rdi, rcx) - vmovups(ymm11, mem(rdx)) - add(rdi, rdx) + lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + vunpcklps(ymm14, ymm12, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - vmovups(ymm12, mem(rcx)) - add(rdi, rcx) - vmovups(ymm13, mem(rdx)) - add(rdi, rdx) + vunpckhps(ymm14, ymm12, ymm0) + 
vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - vmovups(ymm14, mem(rcx)) - //add(rdi, rcx) - vmovups(ymm15, mem(rdx)) - //add(rdi, rdx) + vunpcklps(ymm7, ymm5, ymm0) + vunpcklps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) - jmp(.SDONE) // jump to end. + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) + vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) + vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - label(.SCOLSTORBZ) + vunpckhps(ymm7, ymm5, ymm0) + vunpckhps(ymm11, ymm9, ymm1) + vshufps(imm(0x4e), ymm1, ymm0, ymm2) + vblendps(imm(0xcc), ymm2, ymm0, ymm0) + vblendps(imm(0x33), ymm2, ymm1, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) + vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - vunpcklps(ymm6, ymm4, ymm0) - vunpcklps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) + vextractf128(imm(0x1), ymm1, xmm2) + vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) + vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - - 
vunpckhps(ymm6, ymm4, ymm0) - vunpckhps(ymm10, ymm8, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - - lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - vunpcklps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) - vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) - vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) - vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - - vunpckhps(ymm14, ymm12, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) - vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) - vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) - vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - - lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c - - - - vunpcklps(ymm7, ymm5, ymm0) - vunpcklps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) - vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) - vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) - - - vunpckhps(ymm7, ymm5, ymm0) - vunpckhps(ymm11, ymm9, ymm1) - vshufps(imm(0x4e), ymm1, ymm0, ymm2) - vblendps(imm(0xcc), ymm2, ymm0, ymm0) - vblendps(imm(0x33), ymm2, ymm1, ymm1) - - vextractf128(imm(0x1), ymm0, xmm2) - 
vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) - vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) - - vextractf128(imm(0x1), ymm1, xmm2) - vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) - vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) - - //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - vunpcklps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) - vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) - vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) - vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) - - vunpckhps(ymm15, ymm13, ymm0) - vextractf128(imm(0x1), ymm0, xmm2) - vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) - vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) - vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) - vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) - - //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c + //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c + vunpcklps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) + vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) + vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) + vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) + vunpckhps(ymm15, ymm13, ymm0) + vextractf128(imm(0x1), ymm0, xmm2) + vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) + vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) + vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) + vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) + //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c label(.SDONE) @@ -896,6 +726,8 @@ void bli_sgemm_haswell_asm_6x16 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( s ); } @@ -927,7 +759,9 @@ void bli_sgemm_haswell_asm_6x16 void 
bli_dgemm_haswell_asm_6x8 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -942,11 +776,13 @@ void bli_dgemm_haswell_asm_6x8 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_AMBI( d, 6, 8, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. @@ -957,36 +793,56 @@ void bli_dgemm_haswell_asm_6x8 //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) - // initialize loop by pre-loading + // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) + mov(var(cs_c), rsi) // load cs_c + lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; - lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; - prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c - prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c - prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c - prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c - prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c - prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
+ jz(.SCOLPREFETCH) // jump to column prefetch case + + lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c + prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c + prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c + prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c + prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c + + jmp(.SPREFETCHDONE) + + label(.SCOLPREFETCH) + lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; + lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c + prefetch(0, mem(rcx, rsi, 1, 7*8)) // prefetch c + 1*cs_c + prefetch(0, mem(rcx, rsi, 2, 7*8)) // prefetch c + 2*cs_c + prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c + prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 4*cs_c + prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 5*cs_c + prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c + prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c + label(.SPREFETCHDONE) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. + // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP - // iteration 0 + // iteration 0 prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rax, 0*8), ymm2) @@ -1013,7 +869,7 @@ void bli_dgemm_haswell_asm_6x8 vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) - // iteration 1 + // iteration 1 prefetch(0, mem(rax, 72*8)) vbroadcastsd(mem(rax, 6*8), ymm2) @@ -1040,7 +896,7 @@ void bli_dgemm_haswell_asm_6x8 vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) - // iteration 2 + // iteration 2 prefetch(0, mem(rax, 80*8)) vbroadcastsd(mem(rax, 12*8), ymm2) @@ -1067,7 +923,7 @@ void bli_dgemm_haswell_asm_6x8 vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) - // iteration 3 + // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) @@ -1109,7 +965,7 @@ void bli_dgemm_haswell_asm_6x8 mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. + // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP @@ -1188,428 +1044,232 @@ void bli_dgemm_haswell_asm_6x8 //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; - // now avoid loading C if beta == 0 + // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLSTORED) // jump to column storage case + + vfmadd231pd(mem(rcx), ymm3, ymm4) + vmovupd(ymm4, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm5) + vmovupd(ymm5, mem(rcx,32)) + add(rdi, rcx) - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. - jz(.DROWSTORED) // jump to row storage case - - - cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
- jz(.DCOLSTORED) // jump to column storage case - - - - label(.DGENSTORED) - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm4, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm6, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm8, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm10, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm12, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm14, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - - - mov(rdx, rcx) // rcx = c + 4*cs_c - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm5, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm7, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm9, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm11, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm13, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm15, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - - - - jmp(.DDONE) // jump to end. 
- - - - label(.DROWSTORED) - - - vfmadd231pd(mem(rcx), ymm3, ymm4) - vmovupd(ymm4, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm5) - vmovupd(ymm5, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm6) - vmovupd(ymm6, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm7) - vmovupd(ymm7, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm8) - vmovupd(ymm8, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm9) - vmovupd(ymm9, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm10) - vmovupd(ymm10, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm11) - vmovupd(ymm11, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm12) - vmovupd(ymm12, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm13) - vmovupd(ymm13, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm14) - vmovupd(ymm14, mem(rcx)) - //add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm15) - vmovupd(ymm15, mem(rdx)) - //add(rdi, rdx) - - - - jmp(.DDONE) // jump to end. 
- - - - label(.DCOLSTORED) - - - vunpcklpd(ymm6, ymm4, ymm0) - vunpckhpd(ymm6, ymm4, ymm1) - vunpcklpd(ymm10, ymm8, ymm2) - vunpckhpd(ymm10, ymm8, ymm3) - vinsertf128(imm(0x1), xmm2, ymm0, ymm4) - vinsertf128(imm(0x1), xmm3, ymm1, ymm6) - vperm2f128(imm(0x31), ymm2, ymm0, ymm8) - vperm2f128(imm(0x31), ymm3, ymm1, ymm10) - - vbroadcastsd(mem(rbx), ymm3) - - vfmadd231pd(mem(rcx), ymm3, ymm4) - vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) - vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) - vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm10) - vmovupd(ymm4, mem(rcx)) - vmovupd(ymm6, mem(rcx, rsi, 1)) - vmovupd(ymm8, mem(rcx, rsi, 2)) - vmovupd(ymm10, mem(rcx, r13, 1)) - - lea(mem(rcx, rsi, 4), rcx) - - vunpcklpd(ymm14, ymm12, ymm0) - vunpckhpd(ymm14, ymm12, ymm1) - vextractf128(imm(0x1), ymm0, xmm2) - vextractf128(imm(0x1), ymm1, xmm4) - - vfmadd231pd(mem(r14), xmm3, xmm0) - vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) - vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) - vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - vmovupd(xmm4, mem(r14, r13, 1)) - - lea(mem(r14, rsi, 4), r14) - - - vunpcklpd(ymm7, ymm5, ymm0) - vunpckhpd(ymm7, ymm5, ymm1) - vunpcklpd(ymm11, ymm9, ymm2) - vunpckhpd(ymm11, ymm9, ymm3) - vinsertf128(imm(0x1), xmm2, ymm0, ymm5) - vinsertf128(imm(0x1), xmm3, ymm1, ymm7) - vperm2f128(imm(0x31), ymm2, ymm0, ymm9) - vperm2f128(imm(0x31), ymm3, ymm1, ymm11) - - vbroadcastsd(mem(rbx), ymm3) - - vfmadd231pd(mem(rcx), ymm3, ymm5) - vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) - vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) - vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm11) - vmovupd(ymm5, mem(rcx)) - vmovupd(ymm7, mem(rcx, rsi, 1)) - vmovupd(ymm9, mem(rcx, rsi, 2)) - vmovupd(ymm11, mem(rcx, r13, 1)) - - //lea(mem(rcx, rsi, 4), rcx) - - vunpcklpd(ymm15, ymm13, ymm0) - vunpckhpd(ymm15, ymm13, ymm1) - vextractf128(imm(0x1), ymm0, xmm2) - vextractf128(imm(0x1), ymm1, xmm4) - - vfmadd231pd(mem(r14), xmm3, xmm0) - 
vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) - vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) - vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - vmovupd(xmm4, mem(r14, r13, 1)) - - //lea(mem(r14, rsi, 4), r14) - - - - jmp(.DDONE) // jump to end. + vfmadd231pd(mem(rcx), ymm3, ymm6) + vmovupd(ymm6, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm7) + vmovupd(ymm7, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm8) + vmovupd(ymm8, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm9) + vmovupd(ymm9, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm10) + vmovupd(ymm10, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm11) + vmovupd(ymm11, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm12) + vmovupd(ymm12, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm13) + vmovupd(ymm13, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm14) + vmovupd(ymm14, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm15) + vmovupd(ymm15, mem(rcx,32)) + //add(rdi, rcx) + + jmp(.DDONE) // jump to end. 
+ + label(.DCOLSTORED) + + vunpcklpd(ymm6, ymm4, ymm0) + vunpckhpd(ymm6, ymm4, ymm1) + vunpcklpd(ymm10, ymm8, ymm2) + vunpckhpd(ymm10, ymm8, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm4) + vinsertf128(imm(0x1), xmm3, ymm1, ymm6) + vperm2f128(imm(0x31), ymm2, ymm0, ymm8) + vperm2f128(imm(0x31), ymm3, ymm1, ymm10) + + vbroadcastsd(mem(rbx), ymm3) + + vfmadd231pd(mem(rcx), ymm3, ymm4) + vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) + vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) + vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm10) + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm6, mem(rcx, rsi, 1)) + vmovupd(ymm8, mem(rcx, rsi, 2)) + vmovupd(ymm10, mem(rcx, r13, 1)) + + lea(mem(rcx, rsi, 4), rcx) + + vunpcklpd(ymm14, ymm12, ymm0) + vunpckhpd(ymm14, ymm12, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) + + vfmadd231pd(mem(r14), xmm3, xmm0) + vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) + vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) + vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) + + lea(mem(r14, rsi, 4), r14) + + + vunpcklpd(ymm7, ymm5, ymm0) + vunpckhpd(ymm7, ymm5, ymm1) + vunpcklpd(ymm11, ymm9, ymm2) + vunpckhpd(ymm11, ymm9, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm5) + vinsertf128(imm(0x1), xmm3, ymm1, ymm7) + vperm2f128(imm(0x31), ymm2, ymm0, ymm9) + vperm2f128(imm(0x31), ymm3, ymm1, ymm11) + + vbroadcastsd(mem(rbx), ymm3) + + vfmadd231pd(mem(rcx), ymm3, ymm5) + vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) + vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) + vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm11) + vmovupd(ymm5, mem(rcx)) + vmovupd(ymm7, mem(rcx, rsi, 1)) + vmovupd(ymm9, mem(rcx, rsi, 2)) + vmovupd(ymm11, mem(rcx, r13, 1)) + + //lea(mem(rcx, rsi, 4), rcx) + vunpcklpd(ymm15, ymm13, ymm0) + vunpckhpd(ymm15, ymm13, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) + + vfmadd231pd(mem(r14), xmm3, xmm0) + 
vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) + vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) + vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) + + //lea(mem(r14, rsi, 4), r14) + + jmp(.DDONE) // jump to end. label(.DBETAZERO) - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. - jz(.DROWSTORBZ) // jump to row storage case - - cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. - jz(.DCOLSTORBZ) // jump to column storage case - - - - label(.DGENSTORBZ) - - - vmovapd(ymm4, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovapd(ymm6, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovapd(ymm8, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovapd(ymm10, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovapd(ymm12, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - - - vmovapd(ymm14, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - - - mov(rdx, rcx) // rcx = c + 4*cs_c - - - vmovapd(ymm5, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
+ jz(.DCOLSTORBZ) // jump to column storage case - vmovapd(ymm7, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm5, mem(rcx,32)) + add(rdi, rcx) + vmovupd(ymm6, mem(rcx)) + vmovupd(ymm7, mem(rcx,32)) + add(rdi, rcx) - vmovapd(ymm9, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; + vmovupd(ymm8, mem(rcx)) + vmovupd(ymm9, mem(rcx,32)) + add(rdi, rcx) - vmovapd(ymm11, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; + vmovupd(ymm10, mem(rcx)) + vmovupd(ymm11, mem(rcx,32)) + add(rdi, rcx) - vmovapd(ymm13, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += rs_c; + vmovupd(ymm12, mem(rcx)) + vmovupd(ymm13, mem(rcx,32)) + add(rdi, rcx) - vmovapd(ymm15, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ + vmovupd(ymm14, mem(rcx)) + vmovupd(ymm15, mem(rcx,32)) + //add(rdi, rcx) + jmp(.DDONE) // jump to end. - jmp(.DDONE) // jump to end. + label(.DCOLSTORBZ) + vunpcklpd(ymm6, ymm4, ymm0) + vunpckhpd(ymm6, ymm4, ymm1) + vunpcklpd(ymm10, ymm8, ymm2) + vunpckhpd(ymm10, ymm8, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm4) + vinsertf128(imm(0x1), xmm3, ymm1, ymm6) + vperm2f128(imm(0x31), ymm2, ymm0, ymm8) + vperm2f128(imm(0x31), ymm3, ymm1, ymm10) + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm6, mem(rcx, rsi, 1)) + vmovupd(ymm8, mem(rcx, rsi, 2)) + vmovupd(ymm10, mem(rcx, r13, 1)) - label(.DROWSTORBZ) + lea(mem(rcx, rsi, 4), rcx) + vunpcklpd(ymm14, ymm12, ymm0) + vunpckhpd(ymm14, ymm12, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) - vmovupd(ymm4, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm5, mem(rdx)) - add(rdi, rdx) + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) - vmovupd(ymm6, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm7, mem(rdx)) - add(rdi, rdx) + lea(mem(r14, rsi, 4), r14) - vmovupd(ymm8, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm9, mem(rdx)) - add(rdi, rdx) + vunpcklpd(ymm7, ymm5, ymm0) + vunpckhpd(ymm7, ymm5, 
ymm1) + vunpcklpd(ymm11, ymm9, ymm2) + vunpckhpd(ymm11, ymm9, ymm3) + vinsertf128(imm(0x1), xmm2, ymm0, ymm5) + vinsertf128(imm(0x1), xmm3, ymm1, ymm7) + vperm2f128(imm(0x31), ymm2, ymm0, ymm9) + vperm2f128(imm(0x31), ymm3, ymm1, ymm11) + vmovupd(ymm5, mem(rcx)) + vmovupd(ymm7, mem(rcx, rsi, 1)) + vmovupd(ymm9, mem(rcx, rsi, 2)) + vmovupd(ymm11, mem(rcx, r13, 1)) - vmovupd(ymm10, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm11, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm12, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm13, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm14, mem(rcx)) - //add(rdi, rcx) - vmovupd(ymm15, mem(rdx)) - //add(rdi, rdx) - - - jmp(.DDONE) // jump to end. - - - - label(.DCOLSTORBZ) - - - vunpcklpd(ymm6, ymm4, ymm0) - vunpckhpd(ymm6, ymm4, ymm1) - vunpcklpd(ymm10, ymm8, ymm2) - vunpckhpd(ymm10, ymm8, ymm3) - vinsertf128(imm(0x1), xmm2, ymm0, ymm4) - vinsertf128(imm(0x1), xmm3, ymm1, ymm6) - vperm2f128(imm(0x31), ymm2, ymm0, ymm8) - vperm2f128(imm(0x31), ymm3, ymm1, ymm10) - - vmovupd(ymm4, mem(rcx)) - vmovupd(ymm6, mem(rcx, rsi, 1)) - vmovupd(ymm8, mem(rcx, rsi, 2)) - vmovupd(ymm10, mem(rcx, r13, 1)) - - lea(mem(rcx, rsi, 4), rcx) - - vunpcklpd(ymm14, ymm12, ymm0) - vunpckhpd(ymm14, ymm12, ymm1) - vextractf128(imm(0x1), ymm0, xmm2) - vextractf128(imm(0x1), ymm1, xmm4) - - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - vmovupd(xmm4, mem(r14, r13, 1)) - - lea(mem(r14, rsi, 4), r14) - - - vunpcklpd(ymm7, ymm5, ymm0) - vunpckhpd(ymm7, ymm5, ymm1) - vunpcklpd(ymm11, ymm9, ymm2) - vunpckhpd(ymm11, ymm9, ymm3) - vinsertf128(imm(0x1), xmm2, ymm0, ymm5) - vinsertf128(imm(0x1), xmm3, ymm1, ymm7) - vperm2f128(imm(0x31), ymm2, ymm0, ymm9) - vperm2f128(imm(0x31), ymm3, ymm1, ymm11) - - vmovupd(ymm5, mem(rcx)) - vmovupd(ymm7, mem(rcx, rsi, 1)) - vmovupd(ymm9, mem(rcx, rsi, 2)) - vmovupd(ymm11, mem(rcx, r13, 1)) - - //lea(mem(rcx, rsi, 4), rcx) - - vunpcklpd(ymm15, ymm13, ymm0) - vunpckhpd(ymm15, ymm13, ymm1) - 
vextractf128(imm(0x1), ymm0, xmm2) - vextractf128(imm(0x1), ymm1, xmm4) - - vmovupd(xmm0, mem(r14)) - vmovupd(xmm1, mem(r14, rsi, 1)) - vmovupd(xmm2, mem(r14, rsi, 2)) - vmovupd(xmm4, mem(r14, r13, 1)) - - //lea(mem(r14, rsi, 4), r14) + //lea(mem(rcx, rsi, 4), rcx) + vunpcklpd(ymm15, ymm13, ymm0) + vunpckhpd(ymm15, ymm13, ymm1) + vextractf128(imm(0x1), ymm0, xmm2) + vextractf128(imm(0x1), ymm1, xmm4) + vmovupd(xmm0, mem(r14)) + vmovupd(xmm1, mem(r14, rsi, 1)) + vmovupd(xmm2, mem(r14, rsi, 2)) + vmovupd(xmm4, mem(r14, r13, 1)) + //lea(mem(r14, rsi, 4), r14) label(.DDONE) @@ -1641,45 +1301,26 @@ void bli_dgemm_haswell_asm_6x8 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } -// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. -// outputs to ymm0 -#define CGEMM_INPUT_SCALE_GS_BETA_NZ \ - vmovlpd(mem(rcx), xmm0, xmm0) \ - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ - vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \ - vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \ - vinsertf128(imm(1), xmm3, ymm0, ymm0) \ - vpermilps(imm(0xb1), ymm0, ymm3) \ - vmulps(ymm1, ymm0, ymm0) \ - vmulps(ymm2, ymm3, ymm3) \ - vaddsubps(ymm3, ymm0, ymm0) -// assumes values to output are in ymm0 -#define CGEMM_OUTPUT_GS \ - vextractf128(imm(1), ymm0, xmm3) \ - vmovlpd(xmm0, mem(rcx)) \ - vmovhpd(xmm0, mem(rcx, rsi, 1)) \ - vmovlpd(xmm3, mem(rcx, rsi, 2)) \ - vmovhpd(xmm3, mem(rcx, r13, 1)) - -#define CGEMM_INPUT_SCALE_RS_BETA_NZ \ - vmovups(mem(rcx), ymm0) \ +#define CGEMM_INPUT_SCALE_RS_BETA_NZ(where) \ + vmovups(where, ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) -#define CGEMM_OUTPUT_RS \ - vmovups(ymm0, mem(rcx)) \ - void bli_cgemm_haswell_asm_3x8 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, @@ -1694,11 +1335,13 @@ void bli_cgemm_haswell_asm_3x8 // Typecast local copies of integers in case dim_t and inc_t are a // different 
size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( c, 3, 8, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. @@ -1709,7 +1352,7 @@ void bli_cgemm_haswell_asm_3x8 //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) - // initialize loop by pre-loading + // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) @@ -1730,13 +1373,13 @@ void bli_cgemm_haswell_asm_3x8 mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. + // contains the k_left loop. label(.CLOOPKITER) // MAIN LOOP - // iteration 0 + // iteration 0 prefetch(0, mem(rax, 32*8)) vbroadcastss(mem(rax, 0*4), ymm2) @@ -1763,7 +1406,7 @@ void bli_cgemm_haswell_asm_3x8 vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) - // iteration 1 + // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) @@ -1788,7 +1431,7 @@ void bli_cgemm_haswell_asm_3x8 vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) - // iteration 2 + // iteration 2 prefetch(0, mem(rax, 38*8)) vbroadcastss(mem(rax, 12*4), ymm2) @@ -1815,7 +1458,7 @@ void bli_cgemm_haswell_asm_3x8 vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) - // iteration 3 + // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) @@ -1857,7 +1500,7 @@ void bli_cgemm_haswell_asm_3x8 mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. + // else, we prepare to enter k_left loop. 
label(.CLOOPKLEFT) // EDGE LOOP @@ -1900,8 +1543,8 @@ void bli_cgemm_haswell_asm_3x8 label(.CPOSTACCUM) - // permute even and odd elements - // of ymm6/7, ymm10/11, ymm/14/15 + // permute even and odd elements + // of ymm6/7, ymm10/11, ymm14/15 vpermilps(imm(0xb1), ymm6, ymm6) vpermilps(imm(0xb1), ymm7, ymm7) vpermilps(imm(0xb1), ymm10, ymm10) @@ -1910,7 +1553,7 @@ void bli_cgemm_haswell_asm_3x8 vpermilps(imm(0xb1), ymm15, ymm15) - // subtract/add even/odd elements + // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm7, ymm5, ymm5) @@ -1969,16 +1612,7 @@ void bli_cgemm_haswell_asm_3x8 vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate - - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(scomplex) - lea(mem(, rsi, 4), rdx) // rdx = 4*cs_c; - lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; - - - - // now avoid loading C if beta == 0 + // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -1987,162 +1621,49 @@ void bli_cgemm_haswell_asm_3x8 and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.CBETAZERO) // if ZF = 1, jump to beta == 0 case + CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx)) + vaddps(ymm4, ymm0, ymm0) + vmovups(ymm0, mem(rcx)) - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
- jz(.CROWSTORED) // jump to row storage case - - - - label(.CGENSTORED) - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm4, ymm0, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*cs_c; - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm5, ymm0, ymm0) - CGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*rs_c + CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx,32)) + vaddps(ymm5, ymm0, ymm0) + vmovups(ymm0, mem(rcx,32)) - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm8, ymm0, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*cs_c; + CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11)) + vaddps(ymm8, ymm0, ymm0) + vmovups(ymm0, mem(r11)) - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm9, ymm0, ymm0) - CGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*rs_c + CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11,32)) + vaddps(ymm9, ymm0, ymm0) + vmovups(ymm0, mem(r11,32)) - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm12, ymm0, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*cs_c; + CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12)) + vaddps(ymm12, ymm0, ymm0) + vmovups(ymm0, mem(r12)) - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm13, ymm0, ymm0) - CGEMM_OUTPUT_GS - - - - jmp(.CDONE) // jump to end. - - - - label(.CROWSTORED) - - - CGEMM_INPUT_SCALE_RS_BETA_NZ - vaddps(ymm4, ymm0, ymm0) - CGEMM_OUTPUT_RS - add(rdx, rcx) // c += 4*cs_c; - - - CGEMM_INPUT_SCALE_RS_BETA_NZ - vaddps(ymm5, ymm0, ymm0) - CGEMM_OUTPUT_RS - mov(r11, rcx) // rcx = c + 1*rs_c - - - - CGEMM_INPUT_SCALE_RS_BETA_NZ - vaddps(ymm8, ymm0, ymm0) - CGEMM_OUTPUT_RS - add(rdx, rcx) // c += 4*cs_c; - - - CGEMM_INPUT_SCALE_RS_BETA_NZ - vaddps(ymm9, ymm0, ymm0) - CGEMM_OUTPUT_RS - mov(r12, rcx) // rcx = c + 2*rs_c - - - - CGEMM_INPUT_SCALE_RS_BETA_NZ - vaddps(ymm12, ymm0, ymm0) - CGEMM_OUTPUT_RS - add(rdx, rcx) // c += 4*cs_c; - - - CGEMM_INPUT_SCALE_RS_BETA_NZ - vaddps(ymm13, ymm0, ymm0) - CGEMM_OUTPUT_RS - - - - jmp(.CDONE) // jump to end. + CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12,32)) + vaddps(ymm13, ymm0, ymm0) + vmovups(ymm0, mem(r12,32)) + jmp(.CDONE) // jump to end. 
label(.CBETAZERO) - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. - jz(.CROWSTORBZ) // jump to row storage case - - - - label(.CGENSTORBZ) - - - vmovaps(ymm4, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; - - - vmovaps(ymm5, ymm0) - CGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*rs_c - - - - vmovaps(ymm8, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; - - - vmovaps(ymm9, ymm0) - CGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*rs_c - - - - vmovaps(ymm12, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; - - - vmovaps(ymm13, ymm0) - CGEMM_OUTPUT_GS - - - - jmp(.CDONE) // jump to end. - - - - label(.CROWSTORBZ) - - - vmovups(ymm4, mem(rcx)) - vmovups(ymm5, mem(rcx, rdx, 1)) - - vmovups(ymm8, mem(r11)) - vmovups(ymm9, mem(r11, rdx, 1)) - - vmovups(ymm12, mem(r12)) - vmovups(ymm13, mem(r12, rdx, 1)) - + vmovups(ymm4, mem(rcx)) + vmovups(ymm5, mem(rcx,32)) + vmovups(ymm8, mem(r11)) + vmovups(ymm9, mem(r11,32)) + vmovups(ymm12, mem(r12)) + vmovups(ymm13, mem(r12,32)) label(.CDONE) @@ -2174,41 +1695,25 @@ void bli_cgemm_haswell_asm_3x8 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( c ); } -// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
-// outputs to ymm0 -#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ - vmovupd(mem(rcx), xmm0) \ - vmovupd(mem(rcx, rsi, 1), xmm3) \ - vinsertf128(imm(1), xmm3, ymm0, ymm0) \ +#define ZGEMM_INPUT_SCALE_RS_BETA_NZ(where) \ + vmovupd(where, ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) -// assumes values to output are in ymm0 -#define ZGEMM_OUTPUT_GS \ - vextractf128(imm(1), ymm0, xmm3) \ - vmovupd(xmm0, mem(rcx)) \ - vmovupd(xmm3, mem(rcx, rsi, 1)) \ - -#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \ - vmovupd(mem(rcx), ymm0) \ - vpermilpd(imm(0x5), ymm0, ymm3) \ - vmulpd(ymm1, ymm0, ymm0) \ - vmulpd(ymm2, ymm3, ymm3) \ - vaddsubpd(ymm3, ymm0, ymm0) - -#define ZGEMM_OUTPUT_RS \ - vmovupd(ymm0, mem(rcx)) \ - void bli_zgemm_haswell_asm_3x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -2223,11 +1728,13 @@ void bli_zgemm_haswell_asm_3x4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( z, 3, 4, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. @@ -2238,7 +1745,7 @@ void bli_zgemm_haswell_asm_3x4 //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) - // initialize loop by pre-loading + // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) @@ -2260,13 +1767,13 @@ void bli_zgemm_haswell_asm_3x4 mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that - // contains the k_left loop. + // contains the k_left loop. 
label(.ZLOOPKITER) // MAIN LOOP - // iteration 0 + // iteration 0 prefetch(0, mem(rax, 32*16)) vbroadcastsd(mem(rax, 0*8), ymm2) @@ -2293,7 +1800,7 @@ void bli_zgemm_haswell_asm_3x4 vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) - // iteration 1 + // iteration 1 prefetch(0, mem(rax, 36*16)) vbroadcastsd(mem(rax, 6*8), ymm2) @@ -2320,7 +1827,7 @@ void bli_zgemm_haswell_asm_3x4 vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) - // iteration 2 + // iteration 2 prefetch(0, mem(rax, 40*16)) vbroadcastsd(mem(rax, 12*8), ymm2) @@ -2347,7 +1854,7 @@ void bli_zgemm_haswell_asm_3x4 vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) - // iteration 3 + // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) @@ -2389,7 +1896,7 @@ void bli_zgemm_haswell_asm_3x4 mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. - // else, we prepare to enter k_left loop. + // else, we prepare to enter k_left loop. label(.ZLOOPKLEFT) // EDGE LOOP @@ -2431,8 +1938,8 @@ void bli_zgemm_haswell_asm_3x4 label(.ZPOSTACCUM) - // permute even and odd elements - // of ymm6/7, ymm10/11, ymm/14/15 + // permute even and odd elements + // of ymm6/7, ymm10/11, ymm/14/15 vpermilpd(imm(0x5), ymm6, ymm6) vpermilpd(imm(0x5), ymm7, ymm7) vpermilpd(imm(0x5), ymm10, ymm10) @@ -2441,7 +1948,7 @@ void bli_zgemm_haswell_asm_3x4 vpermilpd(imm(0x5), ymm15, ymm15) - // subtract/add even/odd elements + // subtract/add even/odd elements vaddsubpd(ymm6, ymm4, ymm4) vaddsubpd(ymm7, ymm5, ymm5) @@ -2501,15 +2008,7 @@ void bli_zgemm_haswell_asm_3x4 - - mov(var(cs_c), rsi) // load cs_c - lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dcomplex) - lea(mem(, rsi, 2), rsi) - lea(mem(, rsi, 2), rdx) // rdx = 2*cs_c; - - - - // now avoid loading C if beta == 0 + // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. 
vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -2518,162 +2017,49 @@ void bli_zgemm_haswell_asm_3x4 and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case - - cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. - jz(.ZROWSTORED) // jump to row storage case - - - - label(.ZGENSTORED) - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm4, ymm0, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm5, ymm0, ymm0) - ZGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*rs_c - + ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx)) + vaddpd(ymm4, ymm0, ymm0) + vmovupd(ymm0, mem(rcx)) - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm8, ymm0, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; + ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx,32)) + vaddpd(ymm5, ymm0, ymm0) + vmovupd(ymm0, mem(rcx,32)) - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm9, ymm0, ymm0) - ZGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*rs_c + ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11)) + vaddpd(ymm8, ymm0, ymm0) + vmovupd(ymm0, mem(r11)) - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm12, ymm0, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; + ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11,32)) + vaddpd(ymm9, ymm0, ymm0) + vmovupd(ymm0, mem(r11,32)) - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm13, ymm0, ymm0) - ZGEMM_OUTPUT_GS + ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12)) + vaddpd(ymm12, ymm0, ymm0) + vmovupd(ymm0, mem(r12)) - jmp(.ZDONE) // jump to end. 
- - - - label(.ZROWSTORED) - - - ZGEMM_INPUT_SCALE_RS_BETA_NZ - vaddpd(ymm4, ymm0, ymm0) - ZGEMM_OUTPUT_RS - add(rdx, rcx) // c += 2*cs_c; - - - ZGEMM_INPUT_SCALE_RS_BETA_NZ - vaddpd(ymm5, ymm0, ymm0) - ZGEMM_OUTPUT_RS - mov(r11, rcx) // rcx = c + 1*rs_c - - - - ZGEMM_INPUT_SCALE_RS_BETA_NZ - vaddpd(ymm8, ymm0, ymm0) - ZGEMM_OUTPUT_RS - add(rdx, rcx) // c += 2*cs_c; - - - ZGEMM_INPUT_SCALE_RS_BETA_NZ - vaddpd(ymm9, ymm0, ymm0) - ZGEMM_OUTPUT_RS - mov(r12, rcx) // rcx = c + 2*rs_c - - - - ZGEMM_INPUT_SCALE_RS_BETA_NZ - vaddpd(ymm12, ymm0, ymm0) - ZGEMM_OUTPUT_RS - add(rdx, rcx) // c += 2*cs_c; - - - ZGEMM_INPUT_SCALE_RS_BETA_NZ - vaddpd(ymm13, ymm0, ymm0) - ZGEMM_OUTPUT_RS - - - - jmp(.ZDONE) // jump to end. - + ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12,32)) + vaddpd(ymm13, ymm0, ymm0) + vmovupd(ymm0, mem(r12,32)) + jmp(.ZDONE) // jump to end. label(.ZBETAZERO) - cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. - jz(.ZROWSTORBZ) // jump to row storage case - - - - label(.ZGENSTORBZ) - - - vmovapd(ymm4, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; - - - vmovapd(ymm5, ymm0) - ZGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*rs_c - - - - vmovapd(ymm8, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; - - - vmovapd(ymm9, ymm0) - ZGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*rs_c - - - - vmovapd(ymm12, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*cs_c; - - - vmovapd(ymm13, ymm0) - ZGEMM_OUTPUT_GS - - - - jmp(.ZDONE) // jump to end. 
- - - - label(.ZROWSTORBZ) - - - vmovupd(ymm4, mem(rcx)) - vmovupd(ymm5, mem(rcx, rdx, 1)) - - vmovupd(ymm8, mem(r11)) - vmovupd(ymm9, mem(r11, rdx, 1)) - - vmovupd(ymm12, mem(r12)) - vmovupd(ymm13, mem(r12, rdx, 1)) - + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm5, mem(rcx,32)) + vmovupd(ymm8, mem(r11)) + vmovupd(ymm9, mem(r11,32)) + vmovupd(ymm12, mem(r12)) + vmovupd(ymm13, mem(r12,32)) label(.ZDONE) @@ -2705,6 +2091,8 @@ void bli_zgemm_haswell_asm_3x4 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index b074da965..a3a8b0b09 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -78,7 +78,9 @@ void bli_sgemm_haswell_asm_16x6 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -93,29 +95,31 @@ void bli_sgemm_haswell_asm_16x6 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( s, 16, 6, true ); + begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
- + add(imm(32*4), rax) // initialize loop by pre-loading vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) - + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c @@ -124,46 +128,46 @@ void bli_sgemm_haswell_asm_16x6 prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 128*4)) - + vbroadcastss(mem(rbx, 0*4), ymm2) vbroadcastss(mem(rbx, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 2*4), ymm2) vbroadcastss(mem(rbx, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 4*4), ymm2) vbroadcastss(mem(rbx, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rax, -2*32), ymm0) vmovaps(mem(rax, -1*32), ymm1) - + // iteration 1 vbroadcastss(mem(rbx, 6*4), ymm2) vbroadcastss(mem(rbx, 7*4), ymm3) @@ -171,51 +175,51 @@ void bli_sgemm_haswell_asm_16x6 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 8*4), ymm2) vbroadcastss(mem(rbx, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 10*4), ymm2) 
vbroadcastss(mem(rbx, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rax, 0*32), ymm0) vmovaps(mem(rax, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 152*4)) - + vbroadcastss(mem(rbx, 12*4), ymm2) vbroadcastss(mem(rbx, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 14*4), ymm2) vbroadcastss(mem(rbx, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 16*4), ymm2) vbroadcastss(mem(rbx, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rax, 2*32), ymm0) vmovaps(mem(rax, 3*32), ymm1) - + // iteration 3 vbroadcastss(mem(rbx, 18*4), ymm2) vbroadcastss(mem(rbx, 19*4), ymm3) @@ -223,91 +227,91 @@ void bli_sgemm_haswell_asm_16x6 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 20*4), ymm2) vbroadcastss(mem(rbx, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 22*4), ymm2) vbroadcastss(mem(rbx, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(4*16*4), rax) // a += 4*16 (unroll x mr) add(imm(4*6*4), rbx) // b += 4*6 (unroll x nr) - + vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. 
// else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 128*4)) - + vbroadcastss(mem(rbx, 0*4), ymm2) vbroadcastss(mem(rbx, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 2*4), ymm2) vbroadcastss(mem(rbx, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 4*4), ymm2) vbroadcastss(mem(rbx, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(1*16*4), rax) // a += 1*16 (unroll x mr) add(imm(1*6*4), rbx) // b += 1*6 (unroll x nr) - + vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - - - + + + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -320,315 +324,107 @@ void bli_sgemm_haswell_asm_16x6 vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) - - lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*rs_c; - - lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; - lea(mem(rsi, rsi, 4), r15) // r15 = 5*rs_c; - lea(mem(r13, rsi, 4), r10) // r10 = 7*rs_c; - - + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4. 
- jz(.SCOLSTORED) // jump to column storage case - - - - label(.SGENSTORED) - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm4, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm6, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm8, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm10, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm12, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm14, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - mov(rdx, rcx) // rcx = c + 8*rs_c - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm5, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm7, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm9, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm11, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm13, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - SGEMM_INPUT_GS_BETA_NZ - vfmadd213ps(ymm15, ymm3, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - - jmp(.SDONE) // jump to end. 
- - - - label(.SCOLSTORED) - - - vfmadd231ps(mem(rcx), ymm3, ymm4) - vmovups(ymm4, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm5) - vmovups(ymm5, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm6) - vmovups(ymm6, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm7) - vmovups(ymm7, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm8) - vmovups(ymm8, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm9) - vmovups(ymm9, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm10) - vmovups(ymm10, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm11) - vmovups(ymm11, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm12) - vmovups(ymm12, mem(rcx)) - add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm13) - vmovups(ymm13, mem(rdx)) - add(rdi, rdx) - - - vfmadd231ps(mem(rcx), ymm3, ymm14) - vmovups(ymm14, mem(rcx)) - //add(rdi, rcx) - vfmadd231ps(mem(rdx), ymm3, ymm15) - vmovups(ymm15, mem(rdx)) - //add(rdi, rdx) - - - - + + vfmadd231ps(mem(rcx), ymm3, ymm4) + vmovups(ymm4, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm5) + vmovups(ymm5, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm6) + vmovups(ymm6, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm7) + vmovups(ymm7, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm8) + vmovups(ymm8, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm9) + vmovups(ymm9, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm10) + vmovups(ymm10, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm11) + vmovups(ymm11, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm12) + vmovups(ymm12, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm13) + vmovups(ymm13, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231ps(mem(rcx), ymm3, ymm14) + vmovups(ymm14, mem(rcx)) + vfmadd231ps(mem(rcx,32), ymm3, ymm15) + vmovups(ymm15, mem(rcx,32)) + //add(rdi, rcx) + jmp(.SDONE) // jump to end. 
- - - + label(.SBETAZERO) - - cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4. - jz(.SCOLSTORBZ) // jump to column storage case - - - - label(.SGENSTORBZ) - - - vmovaps(ymm4, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm6, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm8, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm10, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm12, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm14, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - mov(rdx, rcx) // rcx = c + 8*rs_c - - - vmovaps(ymm5, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm7, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm9, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm11, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm13, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovaps(ymm15, ymm0) - SGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - - jmp(.SDONE) // jump to end. 
- - - - label(.SCOLSTORBZ) - - - vmovups(ymm4, mem(rcx)) - add(rdi, rcx) - vmovups(ymm5, mem(rdx)) - add(rdi, rdx) - - vmovups(ymm6, mem(rcx)) - add(rdi, rcx) - vmovups(ymm7, mem(rdx)) - add(rdi, rdx) - - - vmovups(ymm8, mem(rcx)) - add(rdi, rcx) - vmovups(ymm9, mem(rdx)) - add(rdi, rdx) - - - vmovups(ymm10, mem(rcx)) - add(rdi, rcx) - vmovups(ymm11, mem(rdx)) - add(rdi, rdx) - - - vmovups(ymm12, mem(rcx)) - add(rdi, rcx) - vmovups(ymm13, mem(rdx)) - add(rdi, rdx) - - - vmovups(ymm14, mem(rcx)) - //add(rdi, rcx) - vmovups(ymm15, mem(rdx)) - //add(rdi, rdx) - - - - - - - + + vmovups(ymm4, mem(rcx)) + vmovups(ymm5, mem(rcx,32)) + add(rdi, rcx) + + vmovups(ymm6, mem(rcx)) + vmovups(ymm7, mem(rcx,32)) + add(rdi, rcx) + + + vmovups(ymm8, mem(rcx)) + vmovups(ymm9, mem(rcx,32)) + add(rdi, rcx) + + + vmovups(ymm10, mem(rcx)) + vmovups(ymm11, mem(rcx,32)) + add(rdi, rcx) + + + vmovups(ymm12, mem(rcx)) + vmovups(ymm13, mem(rcx,32)) + add(rdi, rcx) + + + vmovups(ymm14, mem(rcx)) + vmovups(ymm15, mem(rcx,32)) + //add(rdi, rcx) + label(.SDONE) - - + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -636,6 +432,8 @@ void bli_sgemm_haswell_asm_16x6 "xmm12", "xmm13", 
"xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( s ); } #define DGEMM_INPUT_GS_BETA_NZ \ @@ -664,7 +462,9 @@ void bli_sgemm_haswell_asm_16x6 void bli_dgemm_haswell_asm_8x6 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -679,29 +479,31 @@ void bli_dgemm_haswell_asm_8x6 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( d, 8, 6, false ); + begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. - + add(imm(32*4), rax) // initialize loop by pre-loading vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) - + lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c @@ -710,46 +512,46 @@ void bli_dgemm_haswell_asm_8x6 prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rbx, 0*8), ymm2) vbroadcastsd(mem(rbx, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 2*8), ymm2) vbroadcastsd(mem(rbx, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 4*8), ymm2) vbroadcastsd(mem(rbx, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rax, -2*32), ymm0) vmovapd(mem(rax, -1*32), ymm1) - + // iteration 1 vbroadcastsd(mem(rbx, 6*8), ymm2) vbroadcastsd(mem(rbx, 7*8), ymm3) @@ -757,51 +559,51 @@ void bli_dgemm_haswell_asm_8x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 8*8), ymm2) vbroadcastsd(mem(rbx, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 10*8), ymm2) vbroadcastsd(mem(rbx, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rax, 0*32), ymm0) vmovapd(mem(rax, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 76*8)) - + vbroadcastsd(mem(rbx, 12*8), ymm2) vbroadcastsd(mem(rbx, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 14*8), ymm2) vbroadcastsd(mem(rbx, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 16*8), ymm2) vbroadcastsd(mem(rbx, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) 
vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rax, 2*32), ymm0) vmovapd(mem(rax, 3*32), ymm1) - + // iteration 3 vbroadcastsd(mem(rbx, 18*8), ymm2) vbroadcastsd(mem(rbx, 19*8), ymm3) @@ -809,91 +611,91 @@ void bli_dgemm_haswell_asm_8x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 20*8), ymm2) vbroadcastsd(mem(rbx, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 22*8), ymm2) vbroadcastsd(mem(rbx, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) add(imm(4*6*8), rbx) // b += 4*6 (unroll x nr) - + vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 64*8)) - + vbroadcastsd(mem(rbx, 0*8), ymm2) vbroadcastsd(mem(rbx, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 2*8), ymm2) vbroadcastsd(mem(rbx, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 4*8), ymm2) vbroadcastsd(mem(rbx, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(1*8*8), rax) // a += 1*8 (unroll x mr) add(imm(1*6*8), rbx) // b += 1*6 (unroll x nr) - + vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - - - + + + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -906,314 +708,107 @@ void bli_dgemm_haswell_asm_8x6 vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) - - lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - - lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; - //lea(mem(rsi, rsi, 4), r15) // r15 = 5*rs_c; - //lea(mem(r13, rsi, 4), r10) // r10 = 7*rs_c; - - + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. 
- jz(.DCOLSTORED) // jump to column storage case - - - - label(.DGENSTORED) - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm4, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm6, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm8, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm10, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm12, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm14, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - mov(rdx, rcx) // rcx = c + 4*rs_c - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm5, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm7, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm9, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm11, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm13, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - DGEMM_INPUT_GS_BETA_NZ - vfmadd213pd(ymm15, ymm3, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - - jmp(.DDONE) // jump to end. 
- - - - label(.DCOLSTORED) - - - vfmadd231pd(mem(rcx), ymm3, ymm4) - vmovupd(ymm4, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm5) - vmovupd(ymm5, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm6) - vmovupd(ymm6, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm7) - vmovupd(ymm7, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm8) - vmovupd(ymm8, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm9) - vmovupd(ymm9, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm10) - vmovupd(ymm10, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm11) - vmovupd(ymm11, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm12) - vmovupd(ymm12, mem(rcx)) - add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm13) - vmovupd(ymm13, mem(rdx)) - add(rdi, rdx) - - - vfmadd231pd(mem(rcx), ymm3, ymm14) - vmovupd(ymm14, mem(rcx)) - //add(rdi, rcx) - vfmadd231pd(mem(rdx), ymm3, ymm15) - vmovupd(ymm15, mem(rdx)) - //add(rdi, rdx) - - - - jmp(.DDONE) // jump to end. - - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) + vmovupd(ymm4, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm5) + vmovupd(ymm5, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm6) + vmovupd(ymm6, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm7) + vmovupd(ymm7, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm8) + vmovupd(ymm8, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm9) + vmovupd(ymm9, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm10) + vmovupd(ymm10, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm11) + vmovupd(ymm11, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm12) + vmovupd(ymm12, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm13) + vmovupd(ymm13, mem(rcx,32)) + add(rdi, rcx) + + + vfmadd231pd(mem(rcx), ymm3, ymm14) + vmovupd(ymm14, mem(rcx)) + vfmadd231pd(mem(rcx,32), ymm3, ymm15) + vmovupd(ymm15, mem(rcx,32)) + //add(rdi, rcx) + + jmp(.DDONE) // jump to end. 
+ label(.DBETAZERO) - - cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. - jz(.DCOLSTORBZ) // jump to column storage case - - - - label(.DGENSTORBZ) - - - vmovapd(ymm4, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm6, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm8, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm10, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm12, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm14, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - mov(rdx, rcx) // rcx = c + 4*rs_c - - - vmovapd(ymm5, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm7, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm9, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm11, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm13, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - add(rdi, rcx) // c += cs_c; - - - vmovapd(ymm15, ymm0) - DGEMM_OUTPUT_GS_BETA_NZ - //add(rdi, rcx) // c += cs_c; - - - - jmp(.DDONE) // jump to end. 
- - - - label(.DCOLSTORBZ) - - - vmovupd(ymm4, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm5, mem(rdx)) - add(rdi, rdx) - - vmovupd(ymm6, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm7, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm8, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm9, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm10, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm11, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm12, mem(rcx)) - add(rdi, rcx) - vmovupd(ymm13, mem(rdx)) - add(rdi, rdx) - - - vmovupd(ymm14, mem(rcx)) - //add(rdi, rcx) - vmovupd(ymm15, mem(rdx)) - //add(rdi, rdx) - - - - - - - + + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm5, mem(rcx,32)) + add(rdi, rcx) + + vmovupd(ymm6, mem(rcx)) + vmovupd(ymm7, mem(rcx,32)) + add(rdi, rcx) + + + vmovupd(ymm8, mem(rcx)) + vmovupd(ymm9, mem(rcx,32)) + add(rdi, rcx) + + + vmovupd(ymm10, mem(rcx)) + vmovupd(ymm11, mem(rcx,32)) + add(rdi, rcx) + + + vmovupd(ymm12, mem(rcx)) + vmovupd(ymm13, mem(rcx,32)) + add(rdi, rcx) + + + vmovupd(ymm14, mem(rcx)) + vmovupd(ymm15, mem(rcx,32)) + //add(rdi, rcx) + label(.DDONE) - - - end_asm( + + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1221,45 +816,25 @@ void bli_dgemm_haswell_asm_8x6 
"xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } -// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. -// outputs to ymm0 -#define CGEMM_INPUT_SCALE_GS_BETA_NZ \ - vmovlpd(mem(rcx), xmm0, xmm0) \ - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ - vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \ - vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \ - vinsertf128(imm(1), xmm3, ymm0, ymm0) \ +#define CGEMM_INPUT_SCALE_CS_BETA_NZ(where) \ + vmovups(where, ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) -// assumes values to output are in ymm0 -#define CGEMM_OUTPUT_GS \ - vextractf128(imm(1), ymm0, xmm3) \ - vmovlpd(xmm0, mem(rcx)) \ - vmovhpd(xmm0, mem(rcx, rsi, 1)) \ - vmovlpd(xmm3, mem(rcx, rsi, 2)) \ - vmovhpd(xmm3, mem(rcx, r13, 1)) - -#define CGEMM_INPUT_SCALE_CS_BETA_NZ \ - vmovups(mem(rcx), ymm0) \ - vpermilps(imm(0xb1), ymm0, ymm3) \ - vmulps(ymm1, ymm0, ymm0) \ - vmulps(ymm2, ymm3, ymm3) \ - vaddsubps(ymm3, ymm0, ymm0) - -#define CGEMM_OUTPUT_CS \ - vmovups(ymm0, mem(rcx)) \ - void bli_cgemm_haswell_asm_8x3 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, @@ -1274,75 +849,77 @@ void bli_cgemm_haswell_asm_8x3 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( c, 8, 3, false ); + begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
- + add(imm(32*4), rax) // initialize loop by pre-loading vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) - + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c; lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c; - + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.CLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 32*8)) - + vbroadcastss(mem(rbx, 0*4), ymm2) vbroadcastss(mem(rbx, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 2*4), ymm2) vbroadcastss(mem(rbx, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 4*4), ymm2) vbroadcastss(mem(rbx, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rax, -2*32), ymm0) vmovaps(mem(rax, -1*32), ymm1) - + // iteration 1 vbroadcastss(mem(rbx, 6*4), ymm2) vbroadcastss(mem(rbx, 7*4), ymm3) @@ -1350,51 +927,51 @@ void bli_cgemm_haswell_asm_8x3 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 8*4), ymm2) vbroadcastss(mem(rbx, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 10*4), ymm2) vbroadcastss(mem(rbx, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, 
ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rax, 0*32), ymm0) vmovaps(mem(rax, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 38*8)) - + vbroadcastss(mem(rbx, 12*4), ymm2) vbroadcastss(mem(rbx, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 14*4), ymm2) vbroadcastss(mem(rbx, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 16*4), ymm2) vbroadcastss(mem(rbx, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + vmovaps(mem(rax, 2*32), ymm0) vmovaps(mem(rax, 3*32), ymm1) - + // iteration 3 vbroadcastss(mem(rbx, 18*4), ymm2) vbroadcastss(mem(rbx, 19*4), ymm3) @@ -1402,84 +979,84 @@ void bli_cgemm_haswell_asm_8x3 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 20*4), ymm2) vbroadcastss(mem(rbx, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 22*4), ymm2) vbroadcastss(mem(rbx, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) add(imm(4*3*8), rbx) // b += 4*3 (unroll x nr) - + vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.CLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.CCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.CLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 32*8)) - + vbroadcastss(mem(rbx, 0*4), ymm2) vbroadcastss(mem(rbx, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rbx, 2*4), ymm2) vbroadcastss(mem(rbx, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rbx, 4*4), ymm2) vbroadcastss(mem(rbx, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + add(imm(1*8*8), rax) // a += 1*8 (unroll x mr) add(imm(1*3*8), rbx) // b += 1*3 (unroll x nr) - + vmovaps(mem(rax, -4*32), ymm0) vmovaps(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.CLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.CPOSTACCUM) - - + + // permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilps(imm(0xb1), ymm6, ymm6) @@ -1488,76 +1065,68 @@ void bli_cgemm_haswell_asm_8x3 vpermilps(imm(0xb1), ymm11, ymm11) vpermilps(imm(0xb1), ymm14, ymm14) vpermilps(imm(0xb1), ymm15, ymm15) - - + + // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm7, ymm5, ymm5) - + vaddsubps(ymm10, ymm8, ymm8) vaddsubps(ymm11, ymm9, ymm9) - + vaddsubps(ymm14, ymm12, ymm12) vaddsubps(ymm15, ymm13, ymm13) - - - - + + + + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate - - + + vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) - + vpermilps(imm(0xb1), ymm5, ymm3) vmulps(ymm0, ymm5, ymm5) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm5, ymm5) - - + + vpermilps(imm(0xb1), ymm8, ymm3) vmulps(ymm0, ymm8, ymm8) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm8, ymm8) - + vpermilps(imm(0xb1), ymm9, ymm3) 
vmulps(ymm0, ymm9, ymm9) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm9, ymm9) - - + + vpermilps(imm(0xb1), ymm12, ymm3) vmulps(ymm0, ymm12, ymm12) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm12, ymm12) - + vpermilps(imm(0xb1), ymm13, ymm3) vmulps(ymm0, ymm13, ymm13) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm13, ymm13) - - - - - + + + + + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) - lea(mem(, rsi, 4), rdx) // rdx = 4*rs_c; - lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; - - - + + + // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. @@ -1566,186 +1135,71 @@ void bli_cgemm_haswell_asm_8x3 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.CBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. - jz(.CCOLSTORED) // jump to row storage case - - - - label(.CGENSTORED) - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm4, ymm0, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*rs_c; - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm5, ymm0, ymm0) - CGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*cs_c - - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm8, ymm0, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*rs_c; - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm9, ymm0, ymm0) - CGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*cs_c - - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm12, ymm0, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*rs_c; - - - CGEMM_INPUT_SCALE_GS_BETA_NZ - vaddps(ymm13, ymm0, ymm0) - CGEMM_OUTPUT_GS - - - - jmp(.CDONE) // jump to end. 
- - - - label(.CCOLSTORED) - - - CGEMM_INPUT_SCALE_CS_BETA_NZ - vaddps(ymm4, ymm0, ymm0) - CGEMM_OUTPUT_CS - add(rdx, rcx) // c += 4*rs_c; - - - CGEMM_INPUT_SCALE_CS_BETA_NZ - vaddps(ymm5, ymm0, ymm0) - CGEMM_OUTPUT_CS - mov(r11, rcx) // rcx = c + 1*cs_c - - - - CGEMM_INPUT_SCALE_CS_BETA_NZ - vaddps(ymm8, ymm0, ymm0) - CGEMM_OUTPUT_CS - add(rdx, rcx) // c += 4*rs_c; - - - CGEMM_INPUT_SCALE_CS_BETA_NZ - vaddps(ymm9, ymm0, ymm0) - CGEMM_OUTPUT_CS - mov(r12, rcx) // rcx = c + 2*cs_c - - - - CGEMM_INPUT_SCALE_CS_BETA_NZ - vaddps(ymm12, ymm0, ymm0) - CGEMM_OUTPUT_CS - add(rdx, rcx) // c += 4*rs_c; - - - CGEMM_INPUT_SCALE_CS_BETA_NZ - vaddps(ymm13, ymm0, ymm0) - CGEMM_OUTPUT_CS - - - - jmp(.CDONE) // jump to end. - - - + + CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx)) + vaddps(ymm4, ymm0, ymm0) + vmovups(ymm0, mem(rcx)) + + + CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx,32)) + vaddps(ymm5, ymm0, ymm0) + vmovups(ymm0, mem(rcx,32)) + + + + CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11)) + vaddps(ymm8, ymm0, ymm0) + vmovups(ymm0, mem(r11)) + + + CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11,32)) + vaddps(ymm9, ymm0, ymm0) + vmovups(ymm0, mem(r11,32)) + + + + CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12)) + vaddps(ymm12, ymm0, ymm0) + vmovups(ymm0, mem(r12)) + + + CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12,32)) + vaddps(ymm13, ymm0, ymm0) + vmovups(ymm0, mem(r12,32)) + + jmp(.CDONE) // jump to end. + label(.CBETAZERO) - - cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. - jz(.CCOLSTORBZ) // jump to row storage case - - - - label(.CGENSTORBZ) - - - vmovaps(ymm4, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*rs_c; - - - vmovaps(ymm5, ymm0) - CGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*cs_c - - - - vmovaps(ymm8, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*rs_c; - - - vmovaps(ymm9, ymm0) - CGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*cs_c - - - - vmovaps(ymm12, ymm0) - CGEMM_OUTPUT_GS - add(rdx, rcx) // c += 4*rs_c; - - - vmovaps(ymm13, ymm0) - CGEMM_OUTPUT_GS - - - - jmp(.CDONE) // jump to end. 
- - - - label(.CCOLSTORBZ) - - - vmovups(ymm4, mem(rcx)) - vmovups(ymm5, mem(rcx, rdx, 1)) - - vmovups(ymm8, mem(r11)) - vmovups(ymm9, mem(r11, rdx, 1)) - - vmovups(ymm12, mem(r12)) - vmovups(ymm13, mem(r12, rdx, 1)) - - - - - - + + vmovups(ymm4, mem(rcx)) + vmovups(ymm5, mem(rcx,32)) + + vmovups(ymm8, mem(r11)) + vmovups(ymm9, mem(r11,32)) + + vmovups(ymm12, mem(r12)) + vmovups(ymm13, mem(r12,32)) + label(.CDONE) - - - end_asm( + + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1753,41 +1207,25 @@ void bli_cgemm_haswell_asm_8x3 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( c ); } -// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
-// outputs to ymm0 -#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ - vmovupd(mem(rcx), xmm0) \ - vmovupd(mem(rcx, rsi, 1), xmm3) \ - vinsertf128(imm(1), xmm3, ymm0, ymm0) \ +#define ZGEMM_INPUT_SCALE_CS_BETA_NZ(where) \ + vmovups(where, ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) - -// assumes values to output are in ymm0 -#define ZGEMM_OUTPUT_GS \ - vextractf128(imm(1), ymm0, xmm3) \ - vmovupd(xmm0, mem(rcx)) \ - vmovupd(xmm3, mem(rcx, rsi, 1)) \ - -#define ZGEMM_INPUT_SCALE_CS_BETA_NZ \ - vmovups(mem(rcx), ymm0) \ - vpermilpd(imm(0x5), ymm0, ymm3) \ - vmulpd(ymm1, ymm0, ymm0) \ - vmulpd(ymm2, ymm3, ymm3) \ - vaddsubpd(ymm3, ymm0, ymm0) - -#define ZGEMM_OUTPUT_CS \ - vmovupd(ymm0, mem(rcx)) \ void bli_zgemm_haswell_asm_4x3 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -1802,76 +1240,78 @@ void bli_zgemm_haswell_asm_4x3 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( z, 4, 3, false ); + begin_asm() - + vzeroall() // zero all xmm/ymm registers. - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
- + add(imm(32*4), rax) // initialize loop by pre-loading vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) - + lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c; lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c; - + prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.ZLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 32*16)) - + vbroadcastsd(mem(rbx, 0*8), ymm2) vbroadcastsd(mem(rbx, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 2*8), ymm2) vbroadcastsd(mem(rbx, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 4*8), ymm2) vbroadcastsd(mem(rbx, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rax, -2*32), ymm0) vmovapd(mem(rax, -1*32), ymm1) - + // iteration 1 vbroadcastsd(mem(rbx, 6*8), ymm2) vbroadcastsd(mem(rbx, 7*8), ymm3) @@ -1879,51 +1319,51 @@ void bli_zgemm_haswell_asm_4x3 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 8*8), ymm2) vbroadcastsd(mem(rbx, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 10*8), ymm2) vbroadcastsd(mem(rbx, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, 
ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rax, 0*32), ymm0) vmovapd(mem(rax, 1*32), ymm1) - + // iteration 2 prefetch(0, mem(rax, 38*16)) - + vbroadcastsd(mem(rbx, 12*8), ymm2) vbroadcastsd(mem(rbx, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 14*8), ymm2) vbroadcastsd(mem(rbx, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 16*8), ymm2) vbroadcastsd(mem(rbx, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + vmovapd(mem(rax, 2*32), ymm0) vmovapd(mem(rax, 3*32), ymm1) - + // iteration 3 vbroadcastsd(mem(rbx, 18*8), ymm2) vbroadcastsd(mem(rbx, 19*8), ymm3) @@ -1931,83 +1371,83 @@ void bli_zgemm_haswell_asm_4x3 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 20*8), ymm2) vbroadcastsd(mem(rbx, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 22*8), ymm2) vbroadcastsd(mem(rbx, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(4*4*16), rax) // a += 4*4 (unroll x mr) add(imm(4*3*16), rbx) // b += 4*3 (unroll x nr) - + vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.ZLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.ZCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.ZLOOPKLEFT) // EDGE LOOP - + prefetch(0, mem(rax, 32*16)) - + vbroadcastsd(mem(rbx, 0*8), ymm2) vbroadcastsd(mem(rbx, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rbx, 2*8), ymm2) vbroadcastsd(mem(rbx, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rbx, 4*8), ymm2) vbroadcastsd(mem(rbx, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + add(imm(1*4*16), rax) // a += 1*4 (unroll x mr) add(imm(1*3*16), rbx) // b += 1*3 (unroll x nr) - + vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) - - + + dec(rsi) // i -= 1; jne(.ZLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.ZPOSTACCUM) - + // permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilpd(imm(0x5), ymm6, ymm6) @@ -2016,76 +1456,69 @@ void bli_zgemm_haswell_asm_4x3 vpermilpd(imm(0x5), ymm11, ymm11) vpermilpd(imm(0x5), ymm14, ymm14) vpermilpd(imm(0x5), ymm15, ymm15) - - + + // subtract/add even/odd elements vaddsubpd(ymm6, ymm4, ymm4) vaddsubpd(ymm7, ymm5, ymm5) - + vaddsubpd(ymm10, ymm8, ymm8) vaddsubpd(ymm11, ymm9, ymm9) - + vaddsubpd(ymm14, ymm12, ymm12) vaddsubpd(ymm15, ymm13, ymm13) - - - - + + + + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate - - + + vpermilpd(imm(0x5), ymm4, ymm3) vmulpd(ymm0, ymm4, ymm4) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) - + vpermilpd(imm(0x5), ymm5, ymm3) vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm5, ymm5) - - + + vpermilpd(imm(0x5), ymm8, ymm3) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) - + vpermilpd(imm(0x5), ymm9, ymm3) vmulpd(ymm0, ymm9, 
ymm9) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm9, ymm9) - - + + vpermilpd(imm(0x5), ymm12, ymm3) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm12, ymm12) - + vpermilpd(imm(0x5), ymm13, ymm3) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm13, ymm13) - - - - - + + + + + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) - lea(mem(, rsi, 2), rsi) - lea(mem(, rsi, 2), rdx) // rdx = 2*rs_c; - - - + + + + // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. @@ -2094,171 +1527,56 @@ void bli_zgemm_haswell_asm_4x3 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. - jz(.ZCOLSTORED) // jump to row storage case - - - - label(.ZGENSTORED) - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm4, ymm0, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*rs_c; - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm5, ymm0, ymm0) - ZGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*cs_c - - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm8, ymm0, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*rs_c; - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm9, ymm0, ymm0) - ZGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*cs_c - - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm12, ymm0, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*rs_c; - - - ZGEMM_INPUT_SCALE_GS_BETA_NZ - vaddpd(ymm13, ymm0, ymm0) - ZGEMM_OUTPUT_GS - - - - jmp(.ZDONE) // jump to end. 
- - - - label(.ZCOLSTORED) - - - ZGEMM_INPUT_SCALE_CS_BETA_NZ - vaddpd(ymm4, ymm0, ymm0) - ZGEMM_OUTPUT_CS - add(rdx, rcx) // c += 2*rs_c; - - - ZGEMM_INPUT_SCALE_CS_BETA_NZ - vaddpd(ymm5, ymm0, ymm0) - ZGEMM_OUTPUT_CS - mov(r11, rcx) // rcx = c + 1*cs_c - - - - ZGEMM_INPUT_SCALE_CS_BETA_NZ - vaddpd(ymm8, ymm0, ymm0) - ZGEMM_OUTPUT_CS - add(rdx, rcx) // c += 2*rs_c; - - - ZGEMM_INPUT_SCALE_CS_BETA_NZ - vaddpd(ymm9, ymm0, ymm0) - ZGEMM_OUTPUT_CS - mov(r12, rcx) // rcx = c + 2*cs_c - - - - ZGEMM_INPUT_SCALE_CS_BETA_NZ - vaddpd(ymm12, ymm0, ymm0) - ZGEMM_OUTPUT_CS - add(rdx, rcx) // c += 2*rs_c; - - - ZGEMM_INPUT_SCALE_CS_BETA_NZ - vaddpd(ymm13, ymm0, ymm0) - ZGEMM_OUTPUT_CS - - - - jmp(.ZDONE) // jump to end. - - - + + ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx)) + vaddpd(ymm4, ymm0, ymm0) + vmovupd(ymm0, mem(rcx)) + + + ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx,32)) + vaddpd(ymm5, ymm0, ymm0) + vmovupd(ymm0, mem(rcx,32)) + + + + ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11)) + vaddpd(ymm8, ymm0, ymm0) + vmovupd(ymm0, mem(r11)) + + + ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11,32)) + vaddpd(ymm9, ymm0, ymm0) + vmovupd(ymm0, mem(r11,32)) + + + + ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12)) + vaddpd(ymm12, ymm0, ymm0) + vmovupd(ymm0, mem(r12)) + + + ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12,32)) + vaddpd(ymm13, ymm0, ymm0) + vmovupd(ymm0, mem(r12,32)) + + jmp(.ZDONE) // jump to end. + label(.ZBETAZERO) - - cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. - jz(.ZCOLSTORBZ) // jump to row storage case - - - - label(.ZGENSTORBZ) - - - vmovapd(ymm4, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*rs_c; - - - vmovapd(ymm5, ymm0) - ZGEMM_OUTPUT_GS - mov(r11, rcx) // rcx = c + 1*cs_c - - - - vmovapd(ymm8, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*rs_c; - - - vmovapd(ymm9, ymm0) - ZGEMM_OUTPUT_GS - mov(r12, rcx) // rcx = c + 2*cs_c - - - - vmovapd(ymm12, ymm0) - ZGEMM_OUTPUT_GS - add(rdx, rcx) // c += 2*rs_c; - - - vmovapd(ymm13, ymm0) - ZGEMM_OUTPUT_GS - - - - jmp(.ZDONE) // jump to end. 
- - - - label(.ZCOLSTORBZ) - - - vmovupd(ymm4, mem(rcx)) - vmovupd(ymm5, mem(rcx, rdx, 1)) - - vmovupd(ymm8, mem(r11)) - vmovupd(ymm9, mem(r11, rdx, 1)) - - vmovupd(ymm12, mem(r12)) - vmovupd(ymm13, mem(r12, rdx, 1)) - - - - - - + + vmovupd(ymm4, mem(rcx)) + vmovupd(ymm5, mem(rcx,32)) + + vmovupd(ymm8, mem(r11)) + vmovupd(ymm9, mem(r11,32)) + + vmovupd(ymm12, mem(r12)) + vmovupd(ymm13, mem(r12,32)) + label(.ZDONE) - - - end_asm( + + + end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 @@ -2273,7 +1591,7 @@ void bli_zgemm_haswell_asm_4x3 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2281,6 +1599,8 @@ void bli_zgemm_haswell_asm_4x3 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c index 880632ae0..f20e43f7c 100644 --- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c +++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c @@ -256,6 +256,8 @@ extern int offsets[16]; //#define LOOPMON void bli_dgemm_knc_asm_30x8 ( + dim_t m, + dim_t n, dim_t k, double* restrict alpha, double* restrict a, @@ -273,80 +275,82 @@ void bli_dgemm_knc_asm_30x8 uint64_t k64 = k; + GEMM_UKR_SETUP_CT( d, 30, 8, true ); + #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif - + __asm { #ifdef MONITORS rdtsc mov topl, eax - mov toph, edx + mov toph, edx #endif vpxord zmm0, zmm0, zmm0 vmovaps zmm1, zmm0 //clear out registers - vmovaps zmm2, zmm0 + vmovaps zmm2, zmm0 mov rsi, k64 //loop index - vmovaps zmm3, zmm0 + vmovaps zmm3, zmm0 mov r11, rs_c //load row stride - vmovaps zmm4, zmm0 + vmovaps zmm4, zmm0 sal r11, 3 //scale row stride - 
vmovaps zmm5, zmm0 + vmovaps zmm5, zmm0 mov r15, a //load address of a - vmovaps zmm6, zmm0 + vmovaps zmm6, zmm0 mov rbx, b //load address of b - vmovaps zmm7, zmm0 + vmovaps zmm7, zmm0 - vmovaps zmm8, zmm0 + vmovaps zmm8, zmm0 lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11 vmovaps zmm9, zmm0 - vmovaps zmm10, zmm0 - mov rdi, r11 - vmovaps zmm11, zmm0 + vmovaps zmm10, zmm0 + mov rdi, r11 + vmovaps zmm11, zmm0 sal rdi, 2 //rdi has 4*r11 - vmovaps zmm12, zmm0 + vmovaps zmm12, zmm0 mov rcx, c //load address of c for prefetching - vmovaps zmm13, zmm0 - vmovaps zmm14, zmm0 + vmovaps zmm13, zmm0 + vmovaps zmm14, zmm0 mov r8, k64 - vmovaps zmm15, zmm0 + vmovaps zmm15, zmm0 vmovaps zmm16, zmm0 vmovaps zmm17, zmm0 mov r13, L2_PREFETCH_DIST*8*8 - vmovaps zmm18, zmm0 + vmovaps zmm18, zmm0 mov r14, L2_PREFETCH_DIST*8*32 - vmovaps zmm19, zmm0 - vmovaps zmm20, zmm0 - vmovaps zmm21, zmm0 - vmovaps zmm22, zmm0 + vmovaps zmm19, zmm0 + vmovaps zmm20, zmm0 + vmovaps zmm21, zmm0 + vmovaps zmm22, zmm0 - vmovaps zmm23, zmm0 + vmovaps zmm23, zmm0 sub r8, 30 + L2_PREFETCH_DIST //Check if we have over 40 operations to do. - vmovaps zmm24, zmm0 + vmovaps zmm24, zmm0 mov r8, 30 - vmovaps zmm25, zmm0 + vmovaps zmm25, zmm0 mov r9, 8*8 //amount to increment b* by each iteration - vmovaps zmm26, zmm0 + vmovaps zmm26, zmm0 mov r12, 32*8 //amount to increment a* by each iteration - vmovaps zmm27, zmm0 - vmovaps zmm28, zmm0 - vmovaps zmm29, zmm0 + vmovaps zmm27, zmm0 + vmovaps zmm28, zmm0 + vmovaps zmm29, zmm0 #ifdef MONITORS rdtsc mov midl, eax - mov midh, edx + mov midh, edx #endif jle CONSIDER_UNDER_40 sub rsi, 30 + L2_PREFETCH_DIST - + //First 30 iterations LOOPREFECHCL2: ONE_ITER_PC_L2(rcx) @@ -357,26 +361,26 @@ void bli_dgemm_knc_asm_30x8 LOOPMAIN: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN - + //Penultimate 22 iterations. //Break these off from the main loop to avoid prefetching extra shit. 
mov r14, a_next mov r13, b_next sub r14, r15 sub r13, rbx - + mov rsi, L2_PREFETCH_DIST-10 LOOPMAIN2: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN2 - - + + //Last 10 iterations mov r8, 10 LOOPREFETCHCL1: ONE_ITER_PC_L1(rcx) jne LOOPREFETCHCL1 - + jmp POSTACCUM @@ -403,14 +407,8 @@ void bli_dgemm_knc_asm_30x8 mov r9, c //load address of c for update mov r12, alpha //load address of alpha - // Check if C is row stride. If not, jump to the slow scattered update - mov r14, cs_c - dec r14 - jne SCATTEREDUPDATE - mov r14, beta - vbroadcastsd zmm31, 0[r14] - + vbroadcastsd zmm31, 0[r14] vmulpd zmm0, zmm0, 0[r12]{1to8} vmulpd zmm1, zmm1, 0[r12]{1to8} @@ -467,7 +465,7 @@ void bli_dgemm_knc_asm_30x8 vmovapd [r9+2*r11+0], zmm14 vmovapd [r9+r10+0], zmm15 add r9, rdi - + vmulpd zmm16, zmm16, 0[r12]{1to8} vmulpd zmm17, zmm17, 0[r12]{1to8} vmulpd zmm18, zmm18, 0[r12]{1to8} @@ -516,47 +514,6 @@ void bli_dgemm_knc_asm_30x8 vfmadd231pd zmm29, zmm31, [r9+r11+0] vmovapd [r9+0], zmm28 vmovapd [r9+r11+0], zmm29 - - jmp END - - SCATTEREDUPDATE: - mov r10, offsetPtr - vmovapd zmm31, 0[r10] - vpbroadcastd zmm30, cs_c - mov r13, beta - vpmulld zmm30, zmm31, zmm30 - - mov ebx, 255 - UPDATE_C_ROW_SCATTERED(zmm0, 0, r9) - UPDATE_C_ROW_SCATTERED(zmm1, 1, r9) - UPDATE_C_ROW_SCATTERED(zmm2, 2, r9) - UPDATE_C_ROW_SCATTERED(zmm3, 3, r9) - UPDATE_C_ROW_SCATTERED(zmm4, 4, r9) - UPDATE_C_ROW_SCATTERED(zmm5, 5, r9) - UPDATE_C_ROW_SCATTERED(zmm6, 6, r9) - UPDATE_C_ROW_SCATTERED(zmm7, 7, r9) - UPDATE_C_ROW_SCATTERED(zmm8, 8, r9) - UPDATE_C_ROW_SCATTERED(zmm9, 9, r9) - UPDATE_C_ROW_SCATTERED(zmm10, 10, r9) - UPDATE_C_ROW_SCATTERED(zmm11, 11, r9) - UPDATE_C_ROW_SCATTERED(zmm12, 12, r9) - UPDATE_C_ROW_SCATTERED(zmm13, 13, r9) - UPDATE_C_ROW_SCATTERED(zmm14, 14, r9) - UPDATE_C_ROW_SCATTERED(zmm15, 15, r9) - UPDATE_C_ROW_SCATTERED(zmm16, 16, r9) - UPDATE_C_ROW_SCATTERED(zmm17, 17, r9) - UPDATE_C_ROW_SCATTERED(zmm18, 18, r9) - UPDATE_C_ROW_SCATTERED(zmm19, 19, r9) - UPDATE_C_ROW_SCATTERED(zmm20, 20, r9) - 
UPDATE_C_ROW_SCATTERED(zmm21, 21, r9) - UPDATE_C_ROW_SCATTERED(zmm22, 22, r9) - UPDATE_C_ROW_SCATTERED(zmm23, 23, r9) - UPDATE_C_ROW_SCATTERED(zmm24, 24, r9) - UPDATE_C_ROW_SCATTERED(zmm25, 25, r9) - UPDATE_C_ROW_SCATTERED(zmm26, 26, r9) - UPDATE_C_ROW_SCATTERED(zmm27, 27, r9) - UPDATE_C_ROW_SCATTERED(zmm28, 28, r9) - UPDATE_C_ROW_SCATTERED(zmm29, 29, r9) END: #ifdef MONITORS @@ -566,6 +523,8 @@ void bli_dgemm_knc_asm_30x8 #endif } + GEMM_UKR_FLUSH_CT( d ); + #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c index 866cb62ec..18a8e5e2e 100644 --- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c +++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c @@ -256,6 +256,8 @@ int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9 //#define LOOPMON void bli_sgemm_knc_asm_30x16 ( + dim_t m, + dim_t n, dim_t k, float* restrict alpha, float* restrict a, @@ -273,80 +275,82 @@ void bli_sgemm_knc_asm_30x16 uint64_t k64 = k; + GEMM_UKR_SETUP_CT( s, 30, 16, true ); + #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif - + __asm { #ifdef MONITORS rdtsc mov topl, eax - mov toph, edx + mov toph, edx #endif vpxord zmm0, zmm0, zmm0 vmovaps zmm1, zmm0 //clear out registers - vmovaps zmm2, zmm0 + vmovaps zmm2, zmm0 mov rsi, k64 //loop index - vmovaps zmm3, zmm0 + vmovaps zmm3, zmm0 mov r11, rs_c //load row stride - vmovaps zmm4, zmm0 + vmovaps zmm4, zmm0 sal r11, 2 //scale row stride - vmovaps zmm5, zmm0 + vmovaps zmm5, zmm0 mov r15, a //load address of a - vmovaps zmm6, zmm0 + vmovaps zmm6, zmm0 mov rbx, b //load address of b - vmovaps zmm7, zmm0 + vmovaps zmm7, zmm0 - vmovaps zmm8, zmm0 + vmovaps zmm8, zmm0 lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11 vmovaps zmm9, zmm0 - vmovaps zmm10, zmm0 - mov rdi, r11 - vmovaps zmm11, zmm0 + vmovaps zmm10, zmm0 + mov rdi, r11 + vmovaps zmm11, zmm0 
sal rdi, 2 //rdi has 4*r11 - vmovaps zmm12, zmm0 + vmovaps zmm12, zmm0 mov rcx, c //load address of c for prefetching - vmovaps zmm13, zmm0 - vmovaps zmm14, zmm0 + vmovaps zmm13, zmm0 + vmovaps zmm14, zmm0 mov r8, k64 - vmovaps zmm15, zmm0 + vmovaps zmm15, zmm0 vmovaps zmm16, zmm0 vmovaps zmm17, zmm0 mov r13, L2_PREFETCH_DIST*4*16 - vmovaps zmm18, zmm0 + vmovaps zmm18, zmm0 mov r14, L2_PREFETCH_DIST*4*32 - vmovaps zmm19, zmm0 - vmovaps zmm20, zmm0 - vmovaps zmm21, zmm0 - vmovaps zmm22, zmm0 + vmovaps zmm19, zmm0 + vmovaps zmm20, zmm0 + vmovaps zmm21, zmm0 + vmovaps zmm22, zmm0 - vmovaps zmm23, zmm0 + vmovaps zmm23, zmm0 sub r8, 30 + L2_PREFETCH_DIST //Check if we have over 40 operations to do. - vmovaps zmm24, zmm0 + vmovaps zmm24, zmm0 mov r8, 30 - vmovaps zmm25, zmm0 + vmovaps zmm25, zmm0 mov r9, 16*4 //amount to increment b* by each iteration - vmovaps zmm26, zmm0 + vmovaps zmm26, zmm0 mov r12, 32*4 //amount to increment a* by each iteration - vmovaps zmm27, zmm0 - vmovaps zmm28, zmm0 - vmovaps zmm29, zmm0 + vmovaps zmm27, zmm0 + vmovaps zmm28, zmm0 + vmovaps zmm29, zmm0 #ifdef MONITORS rdtsc mov midl, eax - mov midh, edx + mov midh, edx #endif jle CONSIDER_UNDER_40 sub rsi, 30 + L2_PREFETCH_DIST - + //First 30 iterations LOOPREFECHCL2: ONE_ITER_PC_L2(rcx) @@ -357,26 +361,26 @@ void bli_sgemm_knc_asm_30x16 LOOPMAIN: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN - + //Penultimate 22 iterations. //Break these off from the main loop to avoid prefetching extra shit. 
mov r14, a_next mov r13, b_next sub r14, r15 sub r13, rbx - + mov rsi, L2_PREFETCH_DIST-10 LOOPMAIN2: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN2 - - + + //Last 10 iterations mov r8, 10 LOOPREFETCHCL1: ONE_ITER_PC_L1(rcx) jne LOOPREFETCHCL1 - + jmp POSTACCUM @@ -384,7 +388,7 @@ void bli_sgemm_knc_asm_30x16 //Used when <= 40 iterations CONSIDER_UNDER_40: mov rsi, k64 - test rsi, rsi + test rsi, rsi je POSTACCUM LOOP_UNDER_40: ONE_ITER_MAIN_LOOP(rcx, rsi) @@ -403,13 +407,8 @@ void bli_sgemm_knc_asm_30x16 mov r9, c //load address of c for update mov r12, alpha //load address of alpha - // Check if C is row stride. If not, jump to the slow scattered update - mov r14, cs_c - dec r14 - jne SCATTEREDUPDATE - mov r14, beta - vbroadcastss zmm31, 0[r14] + vbroadcastss zmm31, 0[r14] vmulps zmm0, zmm0, 0[r12]{1to16} @@ -467,7 +466,7 @@ void bli_sgemm_knc_asm_30x16 vmovaps [r9+2*r11+0], zmm14 vmovaps [r9+r10+0], zmm15 add r9, rdi - + vmulps zmm16, zmm16, 0[r12]{1to16} vmulps zmm17, zmm17, 0[r12]{1to16} vmulps zmm18, zmm18, 0[r12]{1to16} @@ -516,48 +515,6 @@ void bli_sgemm_knc_asm_30x16 vfmadd231ps zmm29, zmm31, [r9+r11+0] vmovaps [r9+0], zmm28 vmovaps [r9+r11+0], zmm29 - - jmp END - - SCATTEREDUPDATE: - - mov r10, offsetPtr - vmovaps zmm31, 0[r10] - vpbroadcastd zmm30, cs_c - mov r13, beta - vpmulld zmm30, zmm31, zmm30 - - mov ebx, 0xFFFF - UPDATE_C_ROW_SCATTERED(zmm0, 0, r9) - UPDATE_C_ROW_SCATTERED(zmm1, 1, r9) - UPDATE_C_ROW_SCATTERED(zmm2, 2, r9) - UPDATE_C_ROW_SCATTERED(zmm3, 3, r9) - UPDATE_C_ROW_SCATTERED(zmm4, 4, r9) - UPDATE_C_ROW_SCATTERED(zmm5, 5, r9) - UPDATE_C_ROW_SCATTERED(zmm6, 6, r9) - UPDATE_C_ROW_SCATTERED(zmm7, 7, r9) - UPDATE_C_ROW_SCATTERED(zmm8, 8, r9) - UPDATE_C_ROW_SCATTERED(zmm9, 9, r9) - UPDATE_C_ROW_SCATTERED(zmm10, 10, r9) - UPDATE_C_ROW_SCATTERED(zmm11, 11, r9) - UPDATE_C_ROW_SCATTERED(zmm12, 12, r9) - UPDATE_C_ROW_SCATTERED(zmm13, 13, r9) - UPDATE_C_ROW_SCATTERED(zmm14, 14, r9) - UPDATE_C_ROW_SCATTERED(zmm15, 15, r9) - UPDATE_C_ROW_SCATTERED(zmm16, 
16, r9) - UPDATE_C_ROW_SCATTERED(zmm17, 17, r9) - UPDATE_C_ROW_SCATTERED(zmm18, 18, r9) - UPDATE_C_ROW_SCATTERED(zmm19, 19, r9) - UPDATE_C_ROW_SCATTERED(zmm20, 20, r9) - UPDATE_C_ROW_SCATTERED(zmm21, 21, r9) - UPDATE_C_ROW_SCATTERED(zmm22, 22, r9) - UPDATE_C_ROW_SCATTERED(zmm23, 23, r9) - UPDATE_C_ROW_SCATTERED(zmm24, 24, r9) - UPDATE_C_ROW_SCATTERED(zmm25, 25, r9) - UPDATE_C_ROW_SCATTERED(zmm26, 26, r9) - UPDATE_C_ROW_SCATTERED(zmm27, 27, r9) - UPDATE_C_ROW_SCATTERED(zmm28, 28, r9) - UPDATE_C_ROW_SCATTERED(zmm29, 29, r9) END: #ifdef MONITORS @@ -567,6 +524,8 @@ void bli_sgemm_knc_asm_30x16 #endif } + GEMM_UKR_FLUSH_CT( s ); + #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c index b794e7c05..a7f860ae0 100644 --- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c +++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c @@ -185,6 +185,8 @@ static int32_t offsets[32] __attribute__((aligned(64))) = //#define LOOPMON void bli_dgemm_knl_asm_24x8 ( + dim_t m, + dim_t n, dim_t k_, double* restrict alpha, double* restrict a, @@ -201,10 +203,12 @@ void bli_dgemm_knl_asm_24x8 const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); - const int32_t * offsetPtr = &offsets[0]; - const int64_t k = k_; - const int64_t rs_c = rs_c_; - const int64_t cs_c = cs_c_; + int32_t * offsetPtr = &offsets[0]; + int64_t k = k_; + int64_t rs_c = rs_c_; + int64_t cs_c = cs_c_; + + GEMM_UKR_SETUP_CT( d, 24, 8, true ); #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; @@ -565,10 +569,7 @@ void bli_dgemm_knl_asm_24x8 // Check if C is row stride. 
If not, jump to the slow scattered update MOV(RAX, VAR(rs_c)) LEA(RAX, MEM(,RAX,8)) - MOV(RBX, VAR(cs_c)) LEA(RDI, MEM(RAX,RAX,2)) - CMP(RBX, IMM(1)) - JNE(SCATTEREDUPDATE) VMOVQ(RDX, XMM(1)) SAL(RDX) //shift out sign bit @@ -592,74 +593,6 @@ void bli_dgemm_knl_asm_24x8 UPDATE_C_BZ_FOUR_ROWS(24,25,26,27) UPDATE_C_BZ_FOUR_ROWS(28,29,30,31) - JMP(END) - - LABEL(SCATTEREDUPDATE) - - MOV(RDI, VAR(offsetPtr)) - VMOVAPS(ZMM(2), MEM(RDI)) - /* Note that this ignores the upper 32 bits in cs_c */ - VPBROADCASTD(ZMM(3), EBX) - VPMULLD(ZMM(2), ZMM(3), ZMM(2)) - - VMOVQ(RDX, XMM(1)) - SAL(RDX) //shift out sign bit - JZ(SCATTERBZ) - - UPDATE_C_ROW_SCATTERED( 8) - UPDATE_C_ROW_SCATTERED( 9) - UPDATE_C_ROW_SCATTERED(10) - UPDATE_C_ROW_SCATTERED(11) - UPDATE_C_ROW_SCATTERED(12) - UPDATE_C_ROW_SCATTERED(13) - UPDATE_C_ROW_SCATTERED(14) - UPDATE_C_ROW_SCATTERED(15) - UPDATE_C_ROW_SCATTERED(16) - UPDATE_C_ROW_SCATTERED(17) - UPDATE_C_ROW_SCATTERED(18) - UPDATE_C_ROW_SCATTERED(19) - UPDATE_C_ROW_SCATTERED(20) - UPDATE_C_ROW_SCATTERED(21) - UPDATE_C_ROW_SCATTERED(22) - UPDATE_C_ROW_SCATTERED(23) - UPDATE_C_ROW_SCATTERED(24) - UPDATE_C_ROW_SCATTERED(25) - UPDATE_C_ROW_SCATTERED(26) - UPDATE_C_ROW_SCATTERED(27) - UPDATE_C_ROW_SCATTERED(28) - UPDATE_C_ROW_SCATTERED(29) - UPDATE_C_ROW_SCATTERED(30) - UPDATE_C_ROW_SCATTERED(31) - - JMP(END) - - LABEL(SCATTERBZ) - - UPDATE_C_BZ_ROW_SCATTERED( 8) - UPDATE_C_BZ_ROW_SCATTERED( 9) - UPDATE_C_BZ_ROW_SCATTERED(10) - UPDATE_C_BZ_ROW_SCATTERED(11) - UPDATE_C_BZ_ROW_SCATTERED(12) - UPDATE_C_BZ_ROW_SCATTERED(13) - UPDATE_C_BZ_ROW_SCATTERED(14) - UPDATE_C_BZ_ROW_SCATTERED(15) - UPDATE_C_BZ_ROW_SCATTERED(16) - UPDATE_C_BZ_ROW_SCATTERED(17) - UPDATE_C_BZ_ROW_SCATTERED(18) - UPDATE_C_BZ_ROW_SCATTERED(19) - UPDATE_C_BZ_ROW_SCATTERED(20) - UPDATE_C_BZ_ROW_SCATTERED(21) - UPDATE_C_BZ_ROW_SCATTERED(22) - UPDATE_C_BZ_ROW_SCATTERED(23) - UPDATE_C_BZ_ROW_SCATTERED(24) - UPDATE_C_BZ_ROW_SCATTERED(25) - UPDATE_C_BZ_ROW_SCATTERED(26) - UPDATE_C_BZ_ROW_SCATTERED(27) 
- UPDATE_C_BZ_ROW_SCATTERED(28) - UPDATE_C_BZ_ROW_SCATTERED(29) - UPDATE_C_BZ_ROW_SCATTERED(30) - UPDATE_C_BZ_ROW_SCATTERED(31) - LABEL(END) #ifdef MONITORS @@ -701,6 +634,8 @@ void bli_dgemm_knl_asm_24x8 "zmm30", "zmm31", "memory" ) + GEMM_UKR_FLUSH_CT( d ); + #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c index 6d485b530..64feba09f 100644 --- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c +++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c @@ -182,6 +182,8 @@ static int32_t offsets[32] __attribute__((aligned(64))) = //#define LOOPMON void bli_sgemm_knl_asm_24x16 ( + dim_t m, + dim_t n, dim_t k_, float* restrict alpha, float* restrict a, @@ -198,10 +200,12 @@ void bli_sgemm_knl_asm_24x16 const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); - const int32_t * offsetPtr = &offsets[0]; - const int64_t k = k_; - const int64_t rs_c = rs_c_; - const int64_t cs_c = cs_c_; + int32_t * offsetPtr = &offsets[0]; + int64_t k = k_; + int64_t rs_c = rs_c_; + int64_t cs_c = cs_c_; + + GEMM_UKR_SETUP_CT( s, 24, 16, true ); #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; @@ -562,10 +566,7 @@ void bli_sgemm_knl_asm_24x16 // Check if C is row stride. 
If not, jump to the slow scattered update MOV(RAX, VAR(rs_c)) LEA(RAX, MEM(,RAX,4)) - MOV(RBX, VAR(cs_c)) LEA(RDI, MEM(RAX,RAX,2)) - CMP(RBX, IMM(1)) - JNE(SCATTEREDUPDATE) VMOVD(EDX, XMM(1)) SAL(EDX) //shift out sign bit @@ -589,74 +590,6 @@ void bli_sgemm_knl_asm_24x16 UPDATE_C_BZ_FOUR_ROWS(24,25,26,27) UPDATE_C_BZ_FOUR_ROWS(28,29,30,31) - JMP(END) - - LABEL(SCATTEREDUPDATE) - - MOV(RDI, VAR(offsetPtr)) - VMOVAPS(ZMM(2), MEM(RDI)) - /* Note that this ignores the upper 32 bits in cs_c */ - VPBROADCASTD(ZMM(3), EBX) - VPMULLD(ZMM(2), ZMM(3), ZMM(2)) - - VMOVD(EDX, XMM(1)) - SAL(EDX) //shift out sign bit - JZ(SCATTERBZ) - - UPDATE_C_ROW_SCATTERED( 8) - UPDATE_C_ROW_SCATTERED( 9) - UPDATE_C_ROW_SCATTERED(10) - UPDATE_C_ROW_SCATTERED(11) - UPDATE_C_ROW_SCATTERED(12) - UPDATE_C_ROW_SCATTERED(13) - UPDATE_C_ROW_SCATTERED(14) - UPDATE_C_ROW_SCATTERED(15) - UPDATE_C_ROW_SCATTERED(16) - UPDATE_C_ROW_SCATTERED(17) - UPDATE_C_ROW_SCATTERED(18) - UPDATE_C_ROW_SCATTERED(19) - UPDATE_C_ROW_SCATTERED(20) - UPDATE_C_ROW_SCATTERED(21) - UPDATE_C_ROW_SCATTERED(22) - UPDATE_C_ROW_SCATTERED(23) - UPDATE_C_ROW_SCATTERED(24) - UPDATE_C_ROW_SCATTERED(25) - UPDATE_C_ROW_SCATTERED(26) - UPDATE_C_ROW_SCATTERED(27) - UPDATE_C_ROW_SCATTERED(28) - UPDATE_C_ROW_SCATTERED(29) - UPDATE_C_ROW_SCATTERED(30) - UPDATE_C_ROW_SCATTERED(31) - - JMP(END) - - LABEL(SCATTERBZ) - - UPDATE_C_BZ_ROW_SCATTERED( 8) - UPDATE_C_BZ_ROW_SCATTERED( 9) - UPDATE_C_BZ_ROW_SCATTERED(10) - UPDATE_C_BZ_ROW_SCATTERED(11) - UPDATE_C_BZ_ROW_SCATTERED(12) - UPDATE_C_BZ_ROW_SCATTERED(13) - UPDATE_C_BZ_ROW_SCATTERED(14) - UPDATE_C_BZ_ROW_SCATTERED(15) - UPDATE_C_BZ_ROW_SCATTERED(16) - UPDATE_C_BZ_ROW_SCATTERED(17) - UPDATE_C_BZ_ROW_SCATTERED(18) - UPDATE_C_BZ_ROW_SCATTERED(19) - UPDATE_C_BZ_ROW_SCATTERED(20) - UPDATE_C_BZ_ROW_SCATTERED(21) - UPDATE_C_BZ_ROW_SCATTERED(22) - UPDATE_C_BZ_ROW_SCATTERED(23) - UPDATE_C_BZ_ROW_SCATTERED(24) - UPDATE_C_BZ_ROW_SCATTERED(25) - UPDATE_C_BZ_ROW_SCATTERED(26) - 
UPDATE_C_BZ_ROW_SCATTERED(27) - UPDATE_C_BZ_ROW_SCATTERED(28) - UPDATE_C_BZ_ROW_SCATTERED(29) - UPDATE_C_BZ_ROW_SCATTERED(30) - UPDATE_C_BZ_ROW_SCATTERED(31) - LABEL(END) #ifdef MONITORS @@ -698,6 +631,8 @@ void bli_sgemm_knl_asm_24x16 "zmm30", "zmm31", "memory" ) + GEMM_UKR_FLUSH_CT( s ); + #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c index e52cc9e0e..a3e39c3ac 100644 --- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c @@ -39,7 +39,9 @@ void bli_sgemm_penryn_asm_8x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -54,38 +56,40 @@ void bli_sgemm_penryn_asm_8x4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_ALIGNED( s, 8, 4, false, 16 ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r9) // load address of b_next. - + sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. - + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. 
movaps(mem(rbx, -8*16), xmm2) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) mov(rdi, r12) // make a copy of cs_c (in bytes) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; - + prefetch(2, mem(r9, 0*4)) // prefetch b_next - + xorps(xmm3, xmm3) xorps(xmm4, xmm4) xorps(xmm5, xmm5) xorps(xmm6, xmm6) - + prefetch(2, mem(rcx, 6*4)) // prefetch c + 0*cs_c xorps(xmm8, xmm8) xorps(xmm9, xmm9) @@ -98,33 +102,33 @@ void bli_sgemm_penryn_asm_8x4 prefetch(2, mem(r10, rdi, 1, 6*4)) // prefetch c + 3*cs_c xorps(xmm14, xmm14) xorps(xmm15, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - + prefetch(0, mem(rax, (4*35+1)*8)) - + addps(xmm6, xmm10) // iteration 0 addps(xmm3, xmm14) movaps(xmm2, xmm3) pshufd(imm(0x39), xmm2, xmm7) mulps(xmm0, xmm2) mulps(xmm1, xmm3) - + addps(xmm4, xmm11) addps(xmm5, xmm15) movaps(xmm7, xmm5) pshufd(imm(0x39), xmm7, xmm6) mulps(xmm0, xmm7) mulps(xmm1, xmm5) - + addps(xmm2, xmm8) movaps(mem(rbx, -7*16), xmm2) addps(xmm3, xmm12) @@ -132,7 +136,7 @@ void bli_sgemm_penryn_asm_8x4 pshufd(imm(0x39), xmm6, xmm4) mulps(xmm0, xmm6) mulps(xmm1, xmm3) - + addps(xmm7, xmm9) addps(xmm5, xmm13) movaps(xmm4, xmm5) @@ -140,22 +144,22 @@ void bli_sgemm_penryn_asm_8x4 movaps(mem(rax, -6*16), xmm0) mulps(xmm1, xmm5) movaps(mem(rax, -5*16), xmm1) - - + + addps(xmm6, xmm10) // iteration 1 addps(xmm3, xmm14) movaps(xmm2, xmm3) pshufd(imm(0x39), xmm2, xmm7) mulps(xmm0, xmm2) mulps(xmm1, xmm3) - + addps(xmm4, xmm11) addps(xmm5, xmm15) movaps(xmm7, xmm5) pshufd(imm(0x39), xmm7, xmm6) mulps(xmm0, xmm7) mulps(xmm1, xmm5) - + addps(xmm2, xmm8) movaps(mem(rbx, -6*16), xmm2) addps(xmm3, xmm12) @@ -163,7 +167,7 @@ void bli_sgemm_penryn_asm_8x4 pshufd(imm(0x39), xmm6, xmm4) mulps(xmm0, xmm6) mulps(xmm1, xmm3) - + addps(xmm7, 
xmm9) addps(xmm5, xmm13) movaps(xmm4, xmm5) @@ -171,22 +175,22 @@ void bli_sgemm_penryn_asm_8x4 movaps(mem(rax, -4*16), xmm0) mulps(xmm1, xmm5) movaps(mem(rax, -3*16), xmm1) - - + + addps(xmm6, xmm10) // iteration 2 addps(xmm3, xmm14) movaps(xmm2, xmm3) pshufd(imm(0x39), xmm2, xmm7) mulps(xmm0, xmm2) mulps(xmm1, xmm3) - + addps(xmm4, xmm11) addps(xmm5, xmm15) movaps(xmm7, xmm5) pshufd(imm(0x39), xmm7, xmm6) mulps(xmm0, xmm7) mulps(xmm1, xmm5) - + addps(xmm2, xmm8) movaps(mem(rbx, -5*16), xmm2) addps(xmm3, xmm12) @@ -194,7 +198,7 @@ void bli_sgemm_penryn_asm_8x4 pshufd(imm(0x39), xmm6, xmm4) mulps(xmm0, xmm6) mulps(xmm1, xmm3) - + addps(xmm7, xmm9) addps(xmm5, xmm13) movaps(xmm4, xmm5) @@ -202,26 +206,26 @@ void bli_sgemm_penryn_asm_8x4 movaps(mem(rax, -2*16), xmm0) mulps(xmm1, xmm5) movaps(mem(rax, -1*16), xmm1) - - + + addps(xmm6, xmm10) // iteration 3 addps(xmm3, xmm14) movaps(xmm2, xmm3) pshufd(imm(0x39), xmm2, xmm7) mulps(xmm0, xmm2) mulps(xmm1, xmm3) - + sub(imm(0-4*8*4), rax) // a += 4*8 (unroll x mr) - + addps(xmm4, xmm11) addps(xmm5, xmm15) movaps(xmm7, xmm5) pshufd(imm(0x39), xmm7, xmm6) mulps(xmm0, xmm7) mulps(xmm1, xmm5) - + sub(imm(0-4*4*4), r9) // b_next += 4*4 (unroll x nr) - + addps(xmm2, xmm8) movaps(mem(rbx, -4*16), xmm2) addps(xmm3, xmm12) @@ -229,9 +233,9 @@ void bli_sgemm_penryn_asm_8x4 pshufd(imm(0x39), xmm6, xmm4) mulps(xmm0, xmm6) mulps(xmm1, xmm3) - + sub(imm(0-4*4*4), rbx) // b += 4*4 (unroll x nr) - + addps(xmm7, xmm9) addps(xmm5, xmm13) movaps(xmm4, xmm5) @@ -239,40 +243,40 @@ void bli_sgemm_penryn_asm_8x4 movaps(mem(rax, -8*16), xmm0) mulps(xmm1, xmm5) movaps(mem(rax, -7*16), xmm1) - + prefetch(2, mem(r9, 0*4)) // prefetch b_next[0] prefetch(2, mem(r9, 16*4)) // prefetch b_next[16] - - + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. 
// else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP - + addps(xmm6, xmm10) // iteration 0 addps(xmm3, xmm14) movaps(xmm2, xmm3) pshufd(imm(0x39), xmm2, xmm7) mulps(xmm0, xmm2) mulps(xmm1, xmm3) - + addps(xmm4, xmm11) addps(xmm5, xmm15) movaps(xmm7, xmm5) pshufd(imm(0x39), xmm7, xmm6) mulps(xmm0, xmm7) mulps(xmm1, xmm5) - + addps(xmm2, xmm8) movaps(mem(rbx, -7*16), xmm2) addps(xmm3, xmm12) @@ -280,7 +284,7 @@ void bli_sgemm_penryn_asm_8x4 pshufd(imm(0x39), xmm6, xmm4) mulps(xmm0, xmm6) mulps(xmm1, xmm3) - + addps(xmm7, xmm9) addps(xmm5, xmm13) movaps(xmm4, xmm5) @@ -288,40 +292,40 @@ void bli_sgemm_penryn_asm_8x4 movaps(mem(rax, -6*16), xmm0) mulps(xmm1, xmm5) movaps(mem(rax, -5*16), xmm1) - + sub(imm(0-1*8*4), rax) // a += 8 (1 x mr) sub(imm(0-1*4*4), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + addps(xmm6, xmm10) addps(xmm3, xmm14) addps(xmm4, xmm11) addps(xmm5, xmm15) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta movss(mem(rax), xmm6) // load alpha to bottom 4 bytes of xmm6 movss(mem(rbx), xmm7) // load beta to bottom 4 bytes of xmm7 pshufd(imm(0x00), xmm6, xmm6) // populate xmm6 with four alphas pshufd(imm(0x00), xmm7, xmm7) // populate xmm7 with four betas - - + + mov(var(rs_c), rsi) // load rs_c mov(rsi, r8) // make a copy of rs_c - + lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) lea(mem(rsi, rsi, 2), r11) // r11 = 3*(rs_c * sizeof(float)) - + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - + // xmm8: xmm9: xmm10: xmm11: // ( ab00 ( ab01 ( ab02 ( ab03 // ab11 ab12 ab13 ab10 @@ -338,20 +342,20 @@ void bli_sgemm_penryn_asm_8x4 shufps(imm(0xd8), xmm11, xmm8) shufps(imm(0xd8), xmm10, xmm11) shufps(imm(0xd8), xmm4, xmm10) - + movaps(xmm8, xmm4) shufps(imm(0xd8), xmm10, xmm8) shufps(imm(0xd8), xmm4, xmm10) movaps(xmm9, xmm5) shufps(imm(0xd8), xmm11, xmm9) shufps(imm(0xd8), xmm5, 
xmm11) - + movaps(xmm13, xmm4) shufps(imm(0xd8), xmm12, xmm13) shufps(imm(0xd8), xmm15, xmm12) shufps(imm(0xd8), xmm14, xmm15) shufps(imm(0xd8), xmm4, xmm14) - + movaps(xmm12, xmm4) shufps(imm(0xd8), xmm14, xmm12) shufps(imm(0xd8), xmm4, xmm14) @@ -369,471 +373,133 @@ void bli_sgemm_penryn_asm_8x4 // ab50 ab51 ab52 ab53 // ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) - - - - // determine if - // c % 16 == 0, AND - // 8*cs_c % 16 == 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - cmp(imm(1), r8) // set ZF if rs_c == 1. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(15), rcx) // set ZF if c & 16 is zero. - setz(bh) // bh = ( ZF == 1 ? 1 : 0 ); - test(imm(15), r12) // set ZF if (4*cs_c) & 16 is zero. - setz(al) // al = ( ZF == 1 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + // now avoid loading C if beta == 0 - + xorpd(xmm0, xmm0) // set xmm0 to zero. ucomisd(xmm0, xmm7) // check if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.SCOLSTORED) // jump to column storage case - - - - label(.SGENSTORED) - - movlps(mem(rcx), xmm0) // load c00 ~ c30 - movhps(mem(rcx, rsi, 1), xmm0) - movlps(mem(rcx, rsi, 2), xmm1) - movhps(mem(rcx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm8) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm8, xmm0) // add the gemm result, - - movss(xmm0, mem(rcx)) // and store back to memory. 
- pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - add(rdi, rcx) - - - movlps(mem(rdx), xmm0) // load c40 ~ c70 - movhps(mem(rdx, rsi, 1), xmm0) - movlps(mem(rdx, rsi, 2), xmm1) - movhps(mem(rdx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm12) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm12, xmm0) // add the gemm result, - - movss(xmm0, mem(rdx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - add(rdi, rdx) - - - movlps(mem(rcx), xmm0) // load c01 ~ c31 - movhps(mem(rcx, rsi, 1), xmm0) - movlps(mem(rcx, rsi, 2), xmm1) - movhps(mem(rcx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm9) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm9, xmm0) // add the gemm result, - - movss(xmm0, mem(rcx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - add(rdi, rcx) - - - movlps(mem(rdx), xmm0) // load c41 ~ c71 - movhps(mem(rdx, rsi, 1), xmm0) - movlps(mem(rdx, rsi, 2), xmm1) - movhps(mem(rdx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm13) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm13, xmm0) // add the gemm result, - - movss(xmm0, mem(rdx)) // and store back to memory. 
- pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - add(rdi, rdx) - - - movlps(mem(rcx), xmm0) // load c02 ~ c32 - movhps(mem(rcx, rsi, 1), xmm0) - movlps(mem(rcx, rsi, 2), xmm1) - movhps(mem(rcx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm10) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm10, xmm0) // add the gemm result, - - movss(xmm0, mem(rcx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - add(rdi, rcx) - - - movlps(mem(rdx), xmm0) // load c42 ~ c72 - movhps(mem(rdx, rsi, 1), xmm0) - movlps(mem(rdx, rsi, 2), xmm1) - movhps(mem(rdx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm14) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm14, xmm0) // add the gemm result, - - movss(xmm0, mem(rdx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - add(rdi, rdx) - - - movlps(mem(rcx), xmm0) // load c03 ~ c33 - movhps(mem(rcx, rsi, 1), xmm0) - movlps(mem(rcx, rsi, 2), xmm1) - movhps(mem(rcx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm11) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm11, xmm0) // add the gemm result, - - movss(xmm0, mem(rcx)) // and store back to memory. 
- pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - - - - movlps(mem(rdx), xmm0) // load c43 ~ c73 - movhps(mem(rdx, rsi, 1), xmm0) - movlps(mem(rdx, rsi, 2), xmm1) - movhps(mem(rdx, r11, 1), xmm1) - shufps(imm(0x88), xmm1, xmm0) - - mulps(xmm6, xmm15) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm15, xmm0) // add the gemm result, - - movss(xmm0, mem(rdx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - - - - jmp(.SDONE) // jump to end. - - - - label(.SCOLSTORED) - - movaps(mem(rcx), xmm0) // load c00 ~ c30, - mulps(xmm6, xmm8) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm8, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) - - movaps(mem(rdx), xmm1) // load c40 ~ c70, - mulps(xmm6, xmm12) // scale by alpha, - mulps(xmm7, xmm1) // scale by beta, - addps(xmm12, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - - movaps(mem(rcx), xmm0) // load c01 ~ c31, - mulps(xmm6, xmm9) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm9, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) - - movaps(mem(rdx), xmm1) // load c41 ~ c71, - mulps(xmm6, xmm13) // scale by alpha, - mulps(xmm7, xmm1) // scale by beta, - addps(xmm13, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - - movaps(mem(rcx), xmm0) // load c02 ~ c32, - mulps(xmm6, xmm10) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm10, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. 
- add(rdi, rcx) - - movaps(mem(rdx), xmm1) // load c42 ~ c72, - mulps(xmm6, xmm14) // scale by alpha, - mulps(xmm7, xmm1) // scale by beta, - addps(xmm14, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - - movaps(mem(rcx), xmm0) // load c03 ~ c33, - mulps(xmm6, xmm11) // scale by alpha, - mulps(xmm7, xmm0) // scale by beta, - addps(xmm11, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. - - - movaps(mem(rdx), xmm1) // load c43 ~ c73, - mulps(xmm6, xmm15) // scale by alpha, - mulps(xmm7, xmm1) // scale by beta, - addps(xmm15, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. - - jmp(.SDONE) // jump to end. - - - - + + movaps(mem(rcx), xmm0) // load c00 ~ c30, + mulps(xmm6, xmm8) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm8, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c40 ~ c70, + mulps(xmm6, xmm12) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm12, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c01 ~ c31, + mulps(xmm6, xmm9) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm9, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c41 ~ c71, + mulps(xmm6, xmm13) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm13, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c02 ~ c32, + mulps(xmm6, xmm10) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm10, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c42 ~ c72, + mulps(xmm6, xmm14) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm14, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c03 ~ c33, + mulps(xmm6, xmm11) // scale by alpha, + mulps(xmm7, xmm0) // scale by beta, + addps(xmm11, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + + + movaps(mem(rdx), xmm1) // load c43 ~ c73, + mulps(xmm6, xmm15) // scale by alpha, + mulps(xmm7, xmm1) // scale by beta, + addps(xmm15, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + + jmp(.SDONE) // jump to end. + label(.SBETAZERO) - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.SCOLSTORBZ) // jump to column storage case - - - - label(.SGENSTORBZ) - - mulps(xmm6, xmm8) // scale by alpha, - movaps(xmm8, xmm0) - - movss(xmm0, mem(rcx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - add(rdi, rcx) - - - mulps(xmm6, xmm12) // scale by alpha, - movaps(xmm12, xmm0) - - movss(xmm0, mem(rdx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - add(rdi, rdx) - - - mulps(xmm6, xmm9) // scale by alpha, - movaps(xmm9, xmm0) - - movss(xmm0, mem(rcx)) // and store back to memory. 
- pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - add(rdi, rcx) - - - mulps(xmm6, xmm13) // scale by alpha, - movaps(xmm13, xmm0) - - movss(xmm0, mem(rdx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - add(rdi, rdx) - - - mulps(xmm6, xmm10) // scale by alpha, - movaps(xmm10, xmm0) - - movss(xmm0, mem(rcx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - add(rdi, rcx) - - - mulps(xmm6, xmm14) // scale by alpha, - movaps(xmm14, xmm0) - - movss(xmm0, mem(rdx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - add(rdi, rdx) - - - mulps(xmm6, xmm11) // scale by alpha, - movaps(xmm11, xmm0) - - movss(xmm0, mem(rcx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rcx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rcx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rcx, r11, 1)) - - - - - mulps(xmm6, xmm15) // scale by alpha, - movaps(xmm15, xmm0) - - movss(xmm0, mem(rdx)) // and store back to memory. - pshufd(imm(0x39), xmm0, xmm1) - movss(xmm1, mem(rdx, rsi, 1)) - pshufd(imm(0x39), xmm1, xmm2) - movss(xmm2, mem(rdx, rsi, 2)) - pshufd(imm(0x39), xmm2, xmm3) - movss(xmm3, mem(rdx, r11, 1)) - - - - - jmp(.SDONE) // jump to end. 
- - - - label(.SCOLSTORBZ) - - // skip loading c00 ~ c30, - mulps(xmm6, xmm8) // scale by alpha, - movaps(xmm8, mem(rcx)) // and store back to memory. - add(rdi, rcx) - // skip loading c40 ~ c70, - mulps(xmm6, xmm12) // scale by alpha, - movaps(xmm12, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - // skip loading c01 ~ c31, - mulps(xmm6, xmm9) // scale by alpha, - movaps(xmm9, mem(rcx)) // and store back to memory. - add(rdi, rcx) - // skip loading c41 ~ c71, - mulps(xmm6, xmm13) // scale by alpha, - movaps(xmm13, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - // skip loading c02 ~ c32, - mulps(xmm6, xmm10) // scale by alpha, - movaps(xmm10, mem(rcx)) // and store back to memory. - add(rdi, rcx) - // skip loading c42 ~ c72, - mulps(xmm6, xmm14) // scale by alpha, - movaps(xmm14, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - // skip loading c03 ~ c33, - mulps(xmm6, xmm11) // scale by alpha, - movaps(xmm11, mem(rcx)) // and store back to memory. - - // skip loading c43 ~ c73, - mulps(xmm6, xmm15) // scale by alpha, - movaps(xmm15, mem(rdx)) // and store back to memory. - - - - - - - - + + // skip loading c00 ~ c30, + mulps(xmm6, xmm8) // scale by alpha, + movaps(xmm8, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c40 ~ c70, + mulps(xmm6, xmm12) // scale by alpha, + movaps(xmm12, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c01 ~ c31, + mulps(xmm6, xmm9) // scale by alpha, + movaps(xmm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c41 ~ c71, + mulps(xmm6, xmm13) // scale by alpha, + movaps(xmm13, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c02 ~ c32, + mulps(xmm6, xmm10) // scale by alpha, + movaps(xmm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c42 ~ c72, + mulps(xmm6, xmm14) // scale by alpha, + movaps(xmm14, mem(rdx)) // and store back to memory. 
+ add(rdi, rdx) + + + // skip loading c03 ~ c33, + mulps(xmm6, xmm11) // scale by alpha, + movaps(xmm11, mem(rcx)) // and store back to memory. + + // skip loading c43 ~ c73, + mulps(xmm6, xmm15) // scale by alpha, + movaps(xmm15, mem(rdx)) // and store back to memory. + label(.SDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next)/*, // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "xmm0", "xmm1", "xmm2", "xmm3", @@ -842,11 +508,15 @@ void bli_sgemm_penryn_asm_8x4 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( s ); } void bli_dgemm_penryn_asm_4x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -861,39 +531,41 @@ void bli_dgemm_penryn_asm_4x4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT_ALIGNED( d, 4, 4, false, 16 ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r9) // load address of b_next. mov(var(a_next), r11) // load address of a_next. 
- + sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. - + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) mov(rdi, r12) // make a copy of cs_c (in bytes) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; - + prefetch(2, mem(r9, 0*8)) // prefetch b_next - + xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) - + prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c xorpd(xmm8, xmm8) xorpd(xmm9, xmm9) @@ -906,22 +578,22 @@ void bli_dgemm_penryn_asm_4x4 prefetch(2, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c xorpd(xmm14, xmm14) xorpd(xmm15, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - + prefetch(0, mem(rax, (4*35+1)*8)) //prefetch(0, mem(rax, (8*97+4)*8)) - + //prefetch(0, mem(r11, 67*4*8)) // prefetch a_next[0] - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -929,13 +601,13 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -943,7 +615,7 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -951,9 +623,9 @@ void bli_dgemm_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - - + + + addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) @@ -961,13 +633,13 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) @@ -975,7 +647,7 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -983,16 +655,16 @@ void bli_dgemm_penryn_asm_4x4 movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) - - + + prefetch(0, mem(rax, (4*37+1)*8)) //prefetch(0, mem(rax, (8*97+12)*8)) - + //prefetch(0, mem(r11, 69*4*8)) // prefetch a_next[8] //sub(imm(-4*4*8), r11) // a_next += 4*4 (unroll x mr) - - - + + + addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) @@ -1000,13 +672,13 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, 
xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) @@ -1014,8 +686,8 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - - + + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -1023,9 +695,9 @@ void bli_dgemm_penryn_asm_4x4 movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -1*16), xmm1) - - - + + + addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) @@ -1033,17 +705,17 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + sub(imm(0-4*4*8), r9) // b_next += 4*4 (unroll x nr) - + addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) @@ -1051,9 +723,9 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -1061,29 +733,29 @@ void bli_dgemm_penryn_asm_4x4 movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) - + prefetch(2, mem(r9, 0*8)) // prefetch b_next[0] prefetch(2, mem(r9, 8*8)) // prefetch b_next[8] - + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - + + + //prefetch(2, mem(r9, -8*8)) // prefetch b_next[-8] - - - + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -1091,13 +763,13 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -1105,7 +777,7 @@ void bli_dgemm_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -1113,38 +785,38 @@ void bli_dgemm_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + sub(imm(0-4*1*8), rax) // a += 4 (1 x mr) sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta movddup(mem(rax), xmm6) // load alpha and duplicate movddup(mem(rbx), xmm7) // load beta and duplicate - - + + mov(var(rs_c), rsi) // load rs_c mov(rsi, r8) // make a copy of rs_c - + lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) - + lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; - + // xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) @@ -1155,15 +827,15 @@ void bli_dgemm_penryn_asm_4x4 movaps(xmm8, xmm0) movsd(xmm9, xmm8) movsd(xmm0, xmm9) - + movaps(xmm10, xmm0) movsd(xmm11, xmm10) movsd(xmm0, xmm11) - + movaps(xmm12, xmm0) movsd(xmm13, xmm12) movsd(xmm0, xmm13) - + movaps(xmm14, xmm0) movsd(xmm15, xmm14) movsd(xmm0, xmm15) @@ -1174,313 +846,133 @@ void bli_dgemm_penryn_asm_4x4 // xmm12: xmm13: xmm14: xmm15: // ( ab20 ( ab21 ( ab22 ( ab23 // ab30 ) ab31 ) ab32 ) ab33 ) - - - - // determine if - // c % 16 == 0, AND - // 8*cs_c % 16 
== 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - cmp(imm(1), r8) // set ZF if rs_c == 1. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(15), rcx) // set ZF if c & 16 is zero. - setz(bh) // bh = ( ZF == 1 ? 1 : 0 ); - test(imm(15), r12) // set ZF if (8*cs_c) & 16 is zero. - setz(al) // al = ( ZF == 1 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + // now avoid loading C if beta == 0 - + xorpd(xmm0, xmm0) // set xmm0 to zero. ucomisd(xmm0, xmm7) // check if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.DCOLSTORED) // jump to column storage case - - - - label(.DGENSTORED) - - movlpd(mem(rcx), xmm0) // load c00 and c10, - movhpd(mem(rcx, rsi, 1), xmm0) - mulpd(xmm6, xmm8) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm8, xmm0) // add the gemm result, - movlpd(xmm0, mem(rcx)) // and store back to memory. - movhpd(xmm0, mem(rcx, rsi, 1)) - add(rdi, rcx) - - movlpd(mem(rdx), xmm1) // load c20 and c30, - movhpd(mem(rdx, rsi, 1), xmm1) - mulpd(xmm6, xmm12) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm12, xmm1) // add the gemm result, - movlpd(xmm1, mem(rdx)) // and store back to memory. - movhpd(xmm1, mem(rdx, rsi, 1)) - add(rdi, rdx) - - - - movlpd(mem(rcx), xmm0) // load c01 and c11, - movhpd(mem(rcx, rsi, 1), xmm0) - mulpd(xmm6, xmm9) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm9, xmm0) // add the gemm result, - movlpd(xmm0, mem(rcx)) // and store back to memory. - movhpd(xmm0, mem(rcx, rsi, 1)) - add(rdi, rcx) - - movlpd(mem(rdx), xmm1) // load c21 and c31, - movhpd(mem(rdx, rsi, 1), xmm1) - mulpd(xmm6, xmm13) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm13, xmm1) // add the gemm result, - movlpd(xmm1, mem(rdx)) // and store back to memory. 
- movhpd(xmm1, mem(rdx, rsi, 1)) - add(rdi, rdx) - - - - movlpd(mem(rcx), xmm0) // load c02 and c12, - movhpd(mem(rcx, rsi, 1), xmm0) - mulpd(xmm6, xmm10) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm10, xmm0) // add the gemm result, - movlpd(xmm0, mem(rcx)) // and store back to memory. - movhpd(xmm0, mem(rcx, rsi, 1)) - add(rdi, rcx) - - movlpd(mem(rdx), xmm1) // load c22 and c32, - movhpd(mem(rdx, rsi, 1), xmm1) - mulpd(xmm6, xmm14) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm14, xmm1) // add the gemm result, - movlpd(xmm1, mem(rdx)) // and store back to memory. - movhpd(xmm1, mem(rdx, rsi, 1)) - add(rdi, rdx) - - - - movlpd(mem(rcx), xmm0) // load c03 and c13, - movhpd(mem(rcx, rsi, 1), xmm0) - mulpd(xmm6, xmm11) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm11, xmm0) // add the gemm result, - movlpd(xmm0, mem(rcx)) // and store back to memory. - movhpd(xmm0, mem(rcx, rsi, 1)) - - - movlpd(mem(rdx), xmm1) // load c23 and c33, - movhpd(mem(rdx, rsi, 1), xmm1) - mulpd(xmm6, xmm15) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm15, xmm1) // add the gemm result, - movlpd(xmm1, mem(rdx)) // and store back to memory. - movhpd(xmm1, mem(rdx, rsi, 1)) - - jmp(.DDONE) // jump to end. - - - - label(.DCOLSTORED) - - movaps(mem(rcx), xmm0) // load c00 and c10, - mulpd(xmm6, xmm8) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm8, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) - - movaps(mem(rdx), xmm1) // load c20 and c30, - mulpd(xmm6, xmm12) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm12, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. 
- add(rdi, rdx) - - - - movaps(mem(rcx), xmm0) // load c01 and c11, - mulpd(xmm6, xmm9) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm9, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) - - movaps(mem(rdx), xmm1) // load c21 and c31, - mulpd(xmm6, xmm13) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm13, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - - movaps(mem(rcx), xmm0) // load c02 and c12, - mulpd(xmm6, xmm10) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm10, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) - - movaps(mem(rdx), xmm1) // load c22 and c32, - mulpd(xmm6, xmm14) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm14, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - - movaps(mem(rcx), xmm0) // load c03 and c13, - mulpd(xmm6, xmm11) // scale by alpha, - mulpd(xmm7, xmm0) // scale by beta, - addpd(xmm11, xmm0) // add the gemm result, - movaps(xmm0, mem(rcx)) // and store back to memory. - - - movaps(mem(rdx), xmm1) // load c23 and c33, - mulpd(xmm6, xmm15) // scale by alpha, - mulpd(xmm7, xmm1) // scale by beta, - addpd(xmm15, xmm1) // add the gemm result, - movaps(xmm1, mem(rdx)) // and store back to memory. - - jmp(.DDONE) // jump to end. - - - - + + movaps(mem(rcx), xmm0) // load c00 and c10, + mulpd(xmm6, xmm8) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm8, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c20 and c30, + mulpd(xmm6, xmm12) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm12, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. 
+ add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c01 and c11, + mulpd(xmm6, xmm9) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm9, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c21 and c31, + mulpd(xmm6, xmm13) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm13, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c02 and c12, + mulpd(xmm6, xmm10) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm10, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) + + movaps(mem(rdx), xmm1) // load c22 and c32, + mulpd(xmm6, xmm14) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm14, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + + movaps(mem(rcx), xmm0) // load c03 and c13, + mulpd(xmm6, xmm11) // scale by alpha, + mulpd(xmm7, xmm0) // scale by beta, + addpd(xmm11, xmm0) // add the gemm result, + movaps(xmm0, mem(rcx)) // and store back to memory. + + + movaps(mem(rdx), xmm1) // load c23 and c33, + mulpd(xmm6, xmm15) // scale by alpha, + mulpd(xmm7, xmm1) // scale by beta, + addpd(xmm15, xmm1) // add the gemm result, + movaps(xmm1, mem(rdx)) // and store back to memory. + + jmp(.DDONE) // jump to end. + label(.DBETAZERO) - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.DCOLSTORBZ) // jump to column storage case - - - - label(.DGENSTORBZ) - // skip loading c00 and c10, - mulpd(xmm6, xmm8) // scale by alpha, - movlpd(xmm8, mem(rcx)) // and store back to memory. - movhpd(xmm8, mem(rcx, rsi, 1)) - add(rdi, rcx) - // skip loading c20 and c30, - mulpd(xmm6, xmm12) // scale by alpha, - movlpd(xmm12, mem(rdx)) // and store back to memory. 
- movhpd(xmm12, mem(rdx, rsi, 1)) - add(rdi, rdx) - - - // skip loading c01 and c11, - mulpd(xmm6, xmm9) // scale by alpha, - movlpd(xmm9, mem(rcx)) // and store back to memory. - movhpd(xmm9, mem(rcx, rsi, 1)) - add(rdi, rcx) - // skip loading c21 and c31, - mulpd(xmm6, xmm13) // scale by alpha, - movlpd(xmm13, mem(rdx)) // and store back to memory. - movhpd(xmm13, mem(rdx, rsi, 1)) - add(rdi, rdx) - - - // skip loading c02 and c12, - mulpd(xmm6, xmm10) // scale by alpha, - movlpd(xmm10, mem(rcx)) // and store back to memory. - movhpd(xmm10, mem(rcx, rsi, 1)) - add(rdi, rcx) - // skip loading c22 and c32, - mulpd(xmm6, xmm14) // scale by alpha, - movlpd(xmm14, mem(rdx)) // and store back to memory. - movhpd(xmm14, mem(rdx, rsi, 1)) - add(rdi, rdx) - - - // skip loading c03 and c13, - mulpd(xmm6, xmm11) // scale by alpha, - movlpd(xmm11, mem(rcx)) // and store back to memory. - movhpd(xmm11, mem(rcx, rsi, 1)) - - // skip loading c23 and c33, - mulpd(xmm6, xmm15) // scale by alpha, - movlpd(xmm15, mem(rdx)) // and store back to memory. - movhpd(xmm15, mem(rdx, rsi, 1)) - - jmp(.DDONE) // jump to end. - - - - label(.DCOLSTORBZ) - - // skip loading c00 and c10, - mulpd(xmm6, xmm8) // scale by alpha, - movaps(xmm8, mem(rcx)) // and store back to memory. - add(rdi, rcx) - // skip loading c20 and c30, - mulpd(xmm6, xmm12) // scale by alpha, - movaps(xmm12, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - // skip loading c01 and c11, - mulpd(xmm6, xmm9) // scale by alpha, - movaps(xmm9, mem(rcx)) // and store back to memory. - add(rdi, rcx) - // skip loading c21 and c31, - mulpd(xmm6, xmm13) // scale by alpha, - movaps(xmm13, mem(rdx)) // and store back to memory. - add(rdi, rdx) - - - // skip loading c02 and c12, - mulpd(xmm6, xmm10) // scale by alpha, - movaps(xmm10, mem(rcx)) // and store back to memory. - add(rdi, rcx) - // skip loading c22 and c32, - mulpd(xmm6, xmm14) // scale by alpha, - movaps(xmm14, mem(rdx)) // and store back to memory. 
- add(rdi, rdx) - - - // skip loading c03 and c13, - mulpd(xmm6, xmm11) // scale by alpha, - movaps(xmm11, mem(rcx)) // and store back to memory. - - // skip loading c23 and c33, - mulpd(xmm6, xmm15) // scale by alpha, - movaps(xmm15, mem(rdx)) // and store back to memory. - - - - - - - - + + // skip loading c00 and c10, + mulpd(xmm6, xmm8) // scale by alpha, + movaps(xmm8, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c20 and c30, + mulpd(xmm6, xmm12) // scale by alpha, + movaps(xmm12, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c01 and c11, + mulpd(xmm6, xmm9) // scale by alpha, + movaps(xmm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c21 and c31, + mulpd(xmm6, xmm13) // scale by alpha, + movaps(xmm13, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c02 and c12, + mulpd(xmm6, xmm10) // scale by alpha, + movaps(xmm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) + // skip loading c22 and c32, + mulpd(xmm6, xmm14) // scale by alpha, + movaps(xmm14, mem(rdx)) // and store back to memory. + add(rdi, rdx) + + + // skip loading c03 and c13, + mulpd(xmm6, xmm11) // scale by alpha, + movaps(xmm11, mem(rcx)) // and store back to memory. + + // skip loading c23 and c33, + mulpd(xmm6, xmm15) // scale by alpha, + movaps(xmm15, mem(rdx)) // and store back to memory. 
+ label(.DDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "xmm0", "xmm1", "xmm2", "xmm3", @@ -1489,6 +981,8 @@ void bli_dgemm_penryn_asm_4x4 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c index 5963dabee..e65ce7178 100644 --- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c +++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c @@ -42,7 +42,9 @@ void bli_sgemm_piledriver_asm_16x3 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -57,36 +59,38 @@ void bli_sgemm_piledriver_asm_16x3 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 8; - uint64_t k_left = k0 % 8; + uint64_t k_iter = k / 8; + uint64_t k_left = k % 8; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( s, 16, 3, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. mov(var(a_next), r14) // load address of a_next. 
- + prefetch(0, mem(rbx, 128)) // prefetch b prefetch(0, mem(rbx, 64+128)) // prefetch b prefetch(0, mem(rbx, 128+128)) // prefetch b - + add(imm(32*4), rax) add(imm(12*4), rbx) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c; - + vbroadcastss(mem(rbx, -12*4), xmm1) vbroadcastss(mem(rbx, -11*4), xmm2) vbroadcastss(mem(rbx, -10*4), xmm3) - + vxorps(xmm4, xmm4, xmm4) vxorps(xmm5, xmm5, xmm5) vxorps(xmm6, xmm6, xmm6) @@ -99,23 +103,23 @@ void bli_sgemm_piledriver_asm_16x3 vxorps(xmm13, xmm13, xmm13) vxorps(xmm14, xmm14, xmm14) vxorps(xmm15, xmm15, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + je(.SCONSIDKLEFT) // if i == 0, jump to k_left code. 
- - + + prefetch(0, mem(rbx, 16+192)) // prefetch b - + // iteration 0 vmovaps(mem(rax, -32*4), xmm0) prefetch(0, mem(rax, 384)) @@ -136,7 +140,7 @@ void bli_sgemm_piledriver_asm_16x3 vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -8*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) - + // iteration 1 vmovaps(mem(rax, -16*4), xmm0) vbroadcastss(mem(rbx, -7*4), xmm3) @@ -158,7 +162,7 @@ void bli_sgemm_piledriver_asm_16x3 vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -5*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) - + // iteration 2 vmovaps(mem(rax, 0*4), xmm0) vbroadcastss(mem(rbx, -4*4), xmm3) @@ -180,7 +184,7 @@ void bli_sgemm_piledriver_asm_16x3 vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -2*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) - + // iteration 3 vmovaps(mem(rax, 16*4), xmm0) vbroadcastss(mem(rbx, -1*4), xmm3) @@ -202,10 +206,10 @@ void bli_sgemm_piledriver_asm_16x3 vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 1*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) - - + + add(imm(4*16*4), rax) // a += 4*16 (unroll x mr) - + // iteration 4 vmovaps(mem(rax, -32*4), xmm0) vbroadcastss(mem(rbx, 2*4), xmm3) @@ -227,9 +231,9 @@ void bli_sgemm_piledriver_asm_16x3 vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 4*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) - + prefetch(0, mem(rbx, 80+192)) // prefetch b - + // iteration 5 vmovaps(mem(rax, -16*4), xmm0) vbroadcastss(mem(rbx, 5*4), xmm3) @@ -251,7 +255,7 @@ void bli_sgemm_piledriver_asm_16x3 vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 7*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) - + // iteration 6 vmovaps(mem(rax, 0*4), xmm0) vbroadcastss(mem(rbx, 8*4), xmm3) @@ -273,7 +277,7 @@ void bli_sgemm_piledriver_asm_16x3 vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 10*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) - + // iteration 7 vmovaps(mem(rax, 16*4), xmm0) vbroadcastss(mem(rbx, 11*4), xmm3) @@ -298,34 +302,34 @@ void bli_sgemm_piledriver_asm_16x3 vbroadcastss(mem(rbx, -11*4), xmm2) vfmadd231ps(xmm3, xmm0, 
xmm15) vbroadcastss(mem(rbx, -10*4), xmm3) - - - - + + + + dec(rsi) // i -= 1; jmp(.SLOOPKITER) // jump to beginning of loop. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP - - + + je(.SPOSTACCUM) // if i == 0, we're done. - - + + prefetch(0, mem(rbx, 16+192)) // prefetch b - + // iteration 0 vmovaps(mem(rax, -32*4), xmm0) prefetch(0, mem(rax, 384)) @@ -347,56 +351,56 @@ void bli_sgemm_piledriver_asm_16x3 vbroadcastss(mem(rbx, -8*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) vbroadcastss(mem(rbx, -7*4), xmm3) - - + + add(imm(1*16*4), rax) // a += 4*16 (unroll x mr) add(imm(1*3*4), rbx) // a += 4*3 (unroll x nr) - - + + dec(rsi) // i -= 1; jmp(.SLOOPKLEFT) // jump to beginning of loop. - - - + + + label(.SPOSTACCUM) - - + + prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c - - - // xmm4: xmm5: xmm6: + + + // xmm4: xmm5: xmm6: // ( ab00 ( ab01 ( ab02 - // ab10 ab11 ab12 + // ab10 ab11 ab12 // ab20 ab21 ab22 // ab30 ) ab31 ) ab32 ) - - // xmm7: xmm8: xmm9: + + // xmm7: xmm8: xmm9: // ( ab40 ( ab41 ( ab42 - // ab50 ab51 ab52 + // ab50 ab51 ab52 // ab60 ab61 ab62 // ab70 ) ab71 ) ab72 ) - + // xmm10: xmm11: xmm12: // ( ab80 ( ab01 ( ab02 - // ab90 ab11 ab12 + // ab90 ab11 ab12 // abA0 abA1 abA2 // abB0 ) abB1 ) abB2 ) - + // xmm13: xmm14: xmm15: // ( abC0 ( abC1 ( abC2 - // abD0 abD1 abD2 + // abD0 abD1 abD2 // abE0 abE1 abE2 // abF0 ) abF1 ) abF2 ) - - - + + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm2) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) @@ 
-409,32 +413,32 @@ void bli_sgemm_piledriver_asm_16x3 vmulps(xmm0, xmm13, xmm13) vmulps(xmm0, xmm14, xmm14) vmulps(xmm0, xmm15, xmm15) - - - + + + prefetch(0, mem(r14)) // prefetch a_next prefetch(0, mem(r14, 64)) // prefetch a_next - - - - + + + + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; - - - + + + // determine if // c % 32 == 0, AND // 4*cs_c % 32 == 0, AND // rs_c == 1 // ie: aligned, ldim aligned, and // column-stored - + cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4. sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); test(imm(31), rcx) // set ZF if c & 32 is zero. @@ -443,465 +447,69 @@ void bli_sgemm_piledriver_asm_16x3 setz(al) // al = ( ZF == 0 ? 1 : 0 ); // and(bl,bh) followed by // and(bh,al) will reveal result - + prefetch(0, mem(r15)) // prefetch b_next prefetch(0, mem(r15, 64)) // prefetch b_next - + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm2) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. 
- jne(.SCOLSTORED) // jump to column storage case - - - - label(.SGENSTORED) - - - vmovlps(mem(rcx), xmm0, xmm0) // load c00:c30 - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm4, xmm0, xmm0) - vmovss(xmm0, mem(rcx)) // store c00:c30 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; - - - vmovlps(mem(rcx), xmm0, xmm0) // load c40:c70 - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm7, xmm0, xmm0) - vmovss(xmm0, mem(rcx)) // store c40:c70 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; - - - vmovlps(mem(rcx), xmm0, xmm0) // load c80:cB0 - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm10, xmm0, xmm0) - vmovss(xmm0, mem(rcx)) // store c80:cB0 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; - - - vmovlps(mem(rcx), xmm0, xmm0) // load cC0:cF0 - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm13, 
xmm0, xmm0) - vmovss(xmm0, mem(rcx)) // store cC0:cF0 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; - - - vmovlps(mem(r10), xmm0, xmm0) // load c01:c31 - vmovhps(mem(r10, rsi, 1), xmm0, xmm0) - vmovlps(mem(r10, r12, 1), xmm1, xmm1) - vmovhps(mem(r10, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm5, xmm0, xmm0) - vmovss(xmm0, mem(r10)) // store c01:c31 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovlps(mem(r10), xmm0, xmm0) // load c41:c71 - vmovhps(mem(r10, rsi, 1), xmm0, xmm0) - vmovlps(mem(r10, r12, 1), xmm1, xmm1) - vmovhps(mem(r10, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm8, xmm0, xmm0) - vmovss(xmm0, mem(r10)) // store c41:c71 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovlps(mem(r10), xmm0, xmm0) // load c81:cB1 - vmovhps(mem(r10, rsi, 1), xmm0, xmm0) - vmovlps(mem(r10, r12, 1), xmm1, xmm1) - vmovhps(mem(r10, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm11, xmm0, xmm0) - vmovss(xmm0, mem(r10)) // store c81:cB1 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovlps(mem(r10), 
xmm0, xmm0) // load cC1:cF1 - vmovhps(mem(r10, rsi, 1), xmm0, xmm0) - vmovlps(mem(r10, r12, 1), xmm1, xmm1) - vmovhps(mem(r10, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm14, xmm0, xmm0) - vmovss(xmm0, mem(r10)) // store cC1:cF1 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovlps(mem(r11), xmm0, xmm0) // load c02:c32 - vmovhps(mem(r11, rsi, 1), xmm0, xmm0) - vmovlps(mem(r11, r12, 1), xmm1, xmm1) - vmovhps(mem(r11, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm6, xmm0, xmm0) - vmovss(xmm0, mem(r11)) // store c02:c32 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - vmovlps(mem(r11), xmm0, xmm0) // load c42:c72 - vmovhps(mem(r11, rsi, 1), xmm0, xmm0) - vmovlps(mem(r11, r12, 1), xmm1, xmm1) - vmovhps(mem(r11, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm9, xmm0, xmm0) - vmovss(xmm0, mem(r11)) // store c42:c72 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - vmovlps(mem(r11), xmm0, xmm0) // load c82:cB2 - vmovhps(mem(r11, rsi, 1), xmm0, xmm0) - vmovlps(mem(r11, r12, 1), xmm1, xmm1) - vmovhps(mem(r11, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm12, xmm0, xmm0) - vmovss(xmm0, mem(r11)) // store c82:cB2 - vpermilps(imm(0x39), xmm0, xmm0) - 
vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - vmovlps(mem(r11), xmm0, xmm0) // load cC2:cF2 - vmovhps(mem(r11, rsi, 1), xmm0, xmm0) - vmovlps(mem(r11, r12, 1), xmm1, xmm1) - vmovhps(mem(r11, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmulps(xmm2, xmm0, xmm0) - vaddps(xmm15, xmm0, xmm0) - vmovss(xmm0, mem(r11)) // store cC2:cF1 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - - jmp(.SDONE) // jump to end. - - - - label(.SCOLSTORED) - - - vfmadd231ps(mem(rcx, 0*16), xmm2, xmm4) - vfmadd231ps(mem(rcx, 1*16), xmm2, xmm7) - vfmadd231ps(mem(rcx, 2*16), xmm2, xmm10) - vfmadd231ps(mem(rcx, 3*16), xmm2, xmm13) - - vmovups(xmm4, mem(rcx, 0*16)) - vmovups(xmm7, mem(rcx, 1*16)) - vmovups(xmm10, mem(rcx, 2*16)) - vmovups(xmm13, mem(rcx, 3*16)) - - vfmadd231ps(mem(r10, 0*16), xmm2, xmm5) - vfmadd231ps(mem(r10, 1*16), xmm2, xmm8) - vfmadd231ps(mem(r10, 2*16), xmm2, xmm11) - vfmadd231ps(mem(r10, 3*16), xmm2, xmm14) - - vmovups(xmm5, mem(r10, 0*16)) - vmovups(xmm8, mem(r10, 1*16)) - vmovups(xmm11, mem(r10, 2*16)) - vmovups(xmm14, mem(r10, 3*16)) - - vfmadd231ps(mem(r11, 0*16), xmm2, xmm6) - vfmadd231ps(mem(r11, 1*16), xmm2, xmm9) - vfmadd231ps(mem(r11, 2*16), xmm2, xmm12) - vfmadd231ps(mem(r11, 3*16), xmm2, xmm15) - - vmovups(xmm6, mem(r11, 0*16)) - vmovups(xmm9, mem(r11, 1*16)) - vmovups(xmm12, mem(r11, 2*16)) - vmovups(xmm15, mem(r11, 3*16)) - - - - jmp(.SDONE) // jump to end. 
- - - + + vfmadd231ps(mem(rcx, 0*16), xmm2, xmm4) + vfmadd231ps(mem(rcx, 1*16), xmm2, xmm7) + vfmadd231ps(mem(rcx, 2*16), xmm2, xmm10) + vfmadd231ps(mem(rcx, 3*16), xmm2, xmm13) + + vfmadd231ps(mem(r10, 0*16), xmm2, xmm5) + vfmadd231ps(mem(r10, 1*16), xmm2, xmm8) + vfmadd231ps(mem(r10, 2*16), xmm2, xmm11) + vfmadd231ps(mem(r10, 3*16), xmm2, xmm14) + + vfmadd231ps(mem(r11, 0*16), xmm2, xmm6) + vfmadd231ps(mem(r11, 1*16), xmm2, xmm9) + vfmadd231ps(mem(r11, 2*16), xmm2, xmm12) + vfmadd231ps(mem(r11, 3*16), xmm2, xmm15) + + // fall through + label(.SBETAZERO) - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.SCOLSTORBZ) // jump to column storage case - - - - label(.SGENSTORBZ) - - - vmovaps(xmm4, xmm0) - vmovss(xmm0, mem(rcx)) // store c00:c30 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; - - - vmovaps(xmm7, xmm0) - vmovss(xmm0, mem(rcx)) // store c40:c70 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; - - - vmovaps(xmm10, xmm0) - vmovss(xmm0, mem(rcx)) // store c80:cB0 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, rsi, 4), rcx) // c += 4*rs_c; - - - vmovaps(xmm13, xmm0) - vmovss(xmm0, mem(rcx)) // store cC0:cF0 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(rcx, r13, 1)) - lea(mem(rcx, 
rsi, 4), rcx) // c += 4*rs_c; - - - vmovaps(xmm5, xmm0) - vmovss(xmm0, mem(r10)) // store c01:c31 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovaps(xmm8, xmm0) - vmovss(xmm0, mem(r10)) // store c41:c71 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovaps(xmm11, xmm0) - vmovss(xmm0, mem(r10)) // store c81:cB1 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovaps(xmm14, xmm0) - vmovss(xmm0, mem(r10)) // store cC1:cF1 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r10, r13, 1)) - lea(mem(r10, rsi, 4), r10) // c += 4*rs_c; - - - vmovaps(xmm6, xmm0) - vmovss(xmm0, mem(r11)) // store c02:c32 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - vmovaps(xmm9, xmm0) - vmovss(xmm0, mem(r11)) // store c42:c72 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - vmovaps(xmm12, xmm0) - vmovss(xmm0, mem(r11)) // store 
c82:cB2 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - vmovaps(xmm15, xmm0) - vmovss(xmm0, mem(r11)) // store cC2:cF1 - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, rsi, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm0) - vmovss(xmm0, mem(r11, r13, 1)) - lea(mem(r11, rsi, 4), r11) // c += 4*rs_c; - - - - jmp(.SDONE) // jump to end. - - - - label(.SCOLSTORBZ) - - - vmovups(xmm4, mem(rcx, 0*16)) - vmovups(xmm7, mem(rcx, 1*16)) - vmovups(xmm10, mem(rcx, 2*16)) - vmovups(xmm13, mem(rcx, 3*16)) - - vmovups(xmm5, mem(r10, 0*16)) - vmovups(xmm8, mem(r10, 1*16)) - vmovups(xmm11, mem(r10, 2*16)) - vmovups(xmm14, mem(r10, 3*16)) - - vmovups(xmm6, mem(r11, 0*16)) - vmovups(xmm9, mem(r11, 1*16)) - vmovups(xmm12, mem(r11, 2*16)) - vmovups(xmm15, mem(r11, 3*16)) - - - - - - + + vmovups(xmm4, mem(rcx, 0*16)) + vmovups(xmm7, mem(rcx, 1*16)) + vmovups(xmm10, mem(rcx, 2*16)) + vmovups(xmm13, mem(rcx, 3*16)) + + vmovups(xmm5, mem(r10, 0*16)) + vmovups(xmm8, mem(r10, 1*16)) + vmovups(xmm11, mem(r10, 2*16)) + vmovups(xmm14, mem(r10, 3*16)) + + vmovups(xmm6, mem(r11, 0*16)) + vmovups(xmm9, mem(r11, 1*16)) + vmovups(xmm12, mem(r11, 2*16)) + vmovups(xmm15, mem(r11, 3*16)) + label(.SDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + 
[rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -909,11 +517,15 @@ void bli_sgemm_piledriver_asm_16x3 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( s ); } void bli_dgemm_piledriver_asm_8x3 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -928,36 +540,38 @@ void bli_dgemm_piledriver_asm_8x3 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 8; - uint64_t k_left = k0 % 8; + uint64_t k_iter = k / 8; + uint64_t k_left = k % 8; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( d, 8, 3, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. mov(var(a_next), r14) // load address of a_next. 
- + prefetch(0, mem(rbx, 128)) // prefetch b prefetch(0, mem(rbx, 64+128)) // prefetch b prefetch(0, mem(rbx, 128+128)) // prefetch b - + add(imm(16*8), rax) add(imm(12*8), rbx) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c; - + vmovddup(mem(rbx, -12*8), xmm1) vmovddup(mem(rbx, -11*8), xmm2) vmovddup(mem(rbx, -10*8), xmm3) - + vxorpd(xmm4, xmm4, xmm4) vxorpd(xmm5, xmm5, xmm5) vxorpd(xmm6, xmm6, xmm6) @@ -970,24 +584,24 @@ void bli_dgemm_piledriver_asm_8x3 vxorpd(xmm13, xmm13, xmm13) vxorpd(xmm14, xmm14, xmm14) vxorpd(xmm15, xmm15, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + je(.DCONSIDKLEFT) // if i == 0, jump to k_left code. 
- - + + prefetch(0, mem(rbx, -32+256)) // prefetch b prefetch(0, mem(rbx, 32+256)) // prefetch b - + // iteration 0 vmovaps(mem(rax, -8*16), xmm0) prefetch(0, mem(rax, 384)) // prefetch a @@ -1008,7 +622,7 @@ void bli_dgemm_piledriver_asm_8x3 vfmadd231pd(xmm2, xmm0, xmm14) vmovddup(mem(rbx, -8*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) - + // iteration 1 vmovaps(mem(rax, -4*16), xmm0) prefetch(0, mem(rax, 64+384)) // prefetch a @@ -1030,7 +644,7 @@ void bli_dgemm_piledriver_asm_8x3 vfmadd231pd(xmm2, xmm0, xmm14) vmovddup(mem(rbx, -5*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) - + // iteration 2 vmovaps(mem(rax, 0*16), xmm0) prefetch(0, mem(rax, 128+384)) // prefetch a @@ -1052,7 +666,7 @@ void bli_dgemm_piledriver_asm_8x3 vfmadd231pd(xmm2, xmm0, xmm14) vmovddup(mem(rbx, -2*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) - + // iteration 3 vmovaps(mem(rax, 4*16), xmm0) prefetch(0, mem(rax, 192+384)) // prefetch a @@ -1075,7 +689,7 @@ void bli_dgemm_piledriver_asm_8x3 vfmadd231pd(xmm2, xmm0, xmm14) vmovddup(mem(rbx, 1*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) - + // iteration 4 vmovaps(mem(rax, -8*16), xmm0) prefetch(0, mem(rax, 384)) // prefetch a @@ -1097,9 +711,9 @@ void bli_dgemm_piledriver_asm_8x3 vfmadd231pd(xmm2, xmm0, xmm14) vmovddup(mem(rbx, 4*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) - + prefetch(0, mem(rbx, 96+256)) // prefetch b - + // iteration 5 vmovaps(mem(rax, -4*16), xmm0) prefetch(0, mem(rax, 64+384)) // prefetch a @@ -1121,8 +735,8 @@ void bli_dgemm_piledriver_asm_8x3 vfmadd231pd(xmm2, xmm0, xmm14) vmovddup(mem(rbx, 7*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) - - + + // iteration 6 vmovaps(mem(rax, 0*16), xmm0) prefetch(0, mem(rax, 128+384)) // prefetch a @@ -1144,7 +758,7 @@ void bli_dgemm_piledriver_asm_8x3 vfmadd231pd(xmm2, xmm0, xmm14) vmovddup(mem(rbx, 10*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) - + // iteration 7 vmovaps(mem(rax, 4*16), xmm0) prefetch(0, mem(rax, 192+384)) // prefetch a @@ -1169,31 +783,31 @@ void bli_dgemm_piledriver_asm_8x3 
vmovddup(mem(rbx, -11*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) vmovddup(mem(rbx, -10*8), xmm3) - - - + + + dec(rsi) // i -= 1; jmp(.DLOOPKITER) // jump to beginning of loop. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done. // else, we prepare to // enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - - + + je(.DPOSTACCUM) // if i == 0, we're done. - + // iteration 0 vmovaps(mem(rax, -8*16), xmm0) prefetch(0, mem(rax, 512)) // prefetch a @@ -1215,48 +829,48 @@ void bli_dgemm_piledriver_asm_8x3 vmovddup(mem(rbx, -8*8), xmm2) vfmadd231pd(xmm3, xmm0, xmm15) vmovddup(mem(rbx, -7*8), xmm3) - - + + add(imm(1*8*8), rax) // a += 1*8 (1 x mr) add(imm(1*3*8), rbx) // b += 1*3 (1 x nr) - - + + dec(rsi) // i -= 1; jmp(.DLOOPKLEFT) // jump to beginning of loop. - - - + + + label(.DPOSTACCUM) - + prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c - - - // xmm4: xmm5: xmm6: - // ( ab00 ( ab01 ( ab02 + + + // xmm4: xmm5: xmm6: + // ( ab00 ( ab01 ( ab02 // ab10 ) ab11 ) ab12 ) // - // xmm7: xmm8: xmm9: - // ( ab20 ( ab21 ( ab22 + // xmm7: xmm8: xmm9: + // ( ab20 ( ab21 ( ab22 // ab30 ) ab31 ) ab32 ) // - // xmm10: xmm11: xmm12: - // ( ab40 ( ab41 ( ab42 + // xmm10: xmm11: xmm12: + // ( ab40 ( ab41 ( ab42 // ab50 ) ab51 ) ab52 ) // - // xmm13: xmm14: xmm15: - // ( ab60 ( ab61 ( ab62 + // xmm13: xmm14: xmm15: + // ( ab60 ( ab61 ( ab62 // ab70 ) ab71 ) ab72 ) - - - - + + + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vmovddup(mem(rax), xmm0) // load alpha and duplicate vmovddup(mem(rbx), xmm2) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(xmm0, xmm6, xmm6) @@ -1269,358 +883,89 @@ void bli_dgemm_piledriver_asm_8x3 vmulpd(xmm0, xmm13, xmm13) vmulpd(xmm0, xmm14, 
xmm14) vmulpd(xmm0, xmm15, xmm15) - - + + prefetch(0, mem(r14)) // prefetch a_next prefetch(0, mem(r14, 64)) // prefetch a_next - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) - - lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - - lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; - lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; - - - - // determine if - // c % 32 == 0, AND - // 8*cs_c % 32 == 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(31), rcx) // set ZF if c & 32 is zero. - setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); - test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero. - setz(al) // al = ( ZF == 0 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + prefetch(0, mem(r15)) // prefetch b_next prefetch(0, mem(r15, 64)) // prefetch b_next - + // now avoid loading C if beta == 0 - + vxorpd(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomisd(xmm0, xmm2) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. 
- je(.DGENSTORED) // jump to column storage case - - - - label(.DCOLSTORED) - - // xmm4: xmm5: xmm6: - // ( ab00 ( ab01 ( ab02 - // ab10 ) ab11 ) ab12 ) - // - // xmm7: xmm8: xmm9: - // ( ab20 ( ab21 ( ab22 - // ab30 ) ab31 ) ab32 ) - // - // xmm10: xmm11: xmm12: - // ( ab40 ( ab41 ( ab42 - // ab50 ) ab51 ) ab52 ) - // - // xmm13: xmm14: xmm15: - // ( ab60 ( ab61 ( ab62 - // ab70 ) ab71 ) ab72 ) - - - vfmadd231pd(mem(rcx, 0*16), xmm2, xmm4) - vfmadd231pd(mem(rcx, 1*16), xmm2, xmm7) - vfmadd231pd(mem(rcx, 2*16), xmm2, xmm10) - vfmadd231pd(mem(rcx, 3*16), xmm2, xmm13) - - vfmadd231pd(mem(r10, 0*16), xmm2, xmm5) - vfmadd231pd(mem(r10, 1*16), xmm2, xmm8) - vfmadd231pd(mem(r10, 2*16), xmm2, xmm11) - vfmadd231pd(mem(r10, 3*16), xmm2, xmm14) - - vfmadd231pd(mem(r11, 0*16), xmm2, xmm6) - vfmadd231pd(mem(r11, 1*16), xmm2, xmm9) - vfmadd231pd(mem(r11, 2*16), xmm2, xmm12) - vfmadd231pd(mem(r11, 3*16), xmm2, xmm15) - - - vmovups(xmm4, mem(rcx, 0*16)) - vmovups(xmm7, mem(rcx, 1*16)) - vmovups(xmm10, mem(rcx, 2*16)) - vmovups(xmm13, mem(rcx, 3*16)) - - vmovups(xmm5, mem(r10, 0*16)) - vmovups(xmm8, mem(r10, 1*16)) - vmovups(xmm11, mem(r10, 2*16)) - vmovups(xmm14, mem(r10, 3*16)) - - vmovups(xmm6, mem(r11, 0*16)) - vmovups(xmm9, mem(r11, 1*16)) - vmovups(xmm12, mem(r11, 2*16)) - vmovups(xmm15, mem(r11, 3*16)) - - - - -/* - vmovupd(mem(rcx), xmm0) // load c00:c10 - vmovupd(mem(rcx, r12, 1), xmm1) // load c20:c30 - vfmadd231pd(xmm2, xmm0, xmm4) - vfmadd231pd(xmm2, xmm1, xmm7) - vmovupd(xmm4, mem(rcx)) // store c00:c10 - vmovupd(xmm7, mem(rcx, r12, 1)) // store c20:c30 - add(rdi, rcx) - - vmovupd(mem(rdx), xmm0) // load c40:c50 - vmovupd(mem(rdx, r12, 1), xmm1) // load c60:c70 - vfmadd213pd(xmm10, xmm2, xmm0) - vfmadd213pd(xmm13, xmm2, xmm1) - vmovupd(xmm0, mem(rdx)) // store c40:c50 - vmovupd(xmm1, mem(rdx, r12, 1)) // store c60:c70 - add(rdi, rdx) - - - vmovupd(mem(rcx), xmm0) // load c01:c11 - vmovupd(mem(rcx, r12, 1), xmm1) // load c21:c31 - vfmadd213pd(xmm5, xmm2, xmm0) - 
vfmadd213pd(xmm8, xmm2, xmm1) - vmovupd(xmm0, mem(rcx)) // store c01:c11 - vmovupd(xmm1, mem(rcx, r12, 1)) // store c21:c31 - add(rdi, rcx) - - vmovupd(mem(rdx), xmm0) // load c41:c51 - vmovupd(mem(rdx, r12, 1), xmm1) // load c61:c71 - vfmadd213pd(xmm11, xmm2, xmm0) - vfmadd213pd(xmm14, xmm2, xmm1) - vmovupd(xmm0, mem(rdx)) // store c41:c51 - vmovupd(xmm1, mem(rdx, r12, 1)) // store c61:c71 - add(rdi, rdx) - - - vmovupd(mem(rcx), xmm0) // load c02:c12 - vmovupd(mem(rcx, r12, 1), xmm1) // load c22:c32 - vfmadd213pd(xmm6, xmm2, xmm0) - vfmadd213pd(xmm9, xmm2, xmm1) - vmovupd(xmm0, mem(rcx)) // store c02:c12 - vmovupd(xmm1, mem(rcx, r12, 1)) // store c22:c32 - - vmovupd(mem(rdx), xmm0) // load c42:c52 - vmovupd(mem(rdx, r12, 1), xmm1) // load c62:c72 - vfmadd213pd(xmm12, xmm2, xmm0) - vfmadd213pd(xmm15, xmm2, xmm1) - vmovupd(xmm0, mem(rdx)) // store c42:c52 - vmovupd(xmm1, mem(rdx, r12, 1)) // store c62:c72 -*/ - - - - jmp(.DDONE) // jump to end. - - - - label(.DGENSTORED) - - - vmovlpd(mem(rcx), xmm0, xmm0) // load c00:c10 - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm4, xmm0, xmm0) - vmovlpd(xmm0, mem(rcx)) // store c00:c10 - vmovhpd(xmm0, mem(rcx, rsi, 1)) - vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c20:c30 - vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm7, xmm0, xmm0) - vmovlpd(xmm0, mem(rcx, r12, 1)) // store c20:c30 - vmovhpd(xmm0, mem(rcx, r13, 1)) - add(rdi, rcx) - - vmovlpd(mem(rdx), xmm0, xmm0) // load c40:c50 - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm10, xmm0, xmm0) - vmovlpd(xmm0, mem(rdx)) // store c40:c50 - vmovhpd(xmm0, mem(rdx, rsi, 1)) - vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c60:c70 - vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm13, xmm0, xmm0) - vmovlpd(xmm0, mem(rdx, r12, 1)) // store c60:c70 - vmovhpd(xmm0, mem(rdx, r13, 1)) - add(rdi, rdx) - - - vmovlpd(mem(rcx), xmm0, xmm0) // load c01:c11 - 
vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm5, xmm0, xmm0) - vmovlpd(xmm0, mem(rcx)) // store c01:c11 - vmovhpd(xmm0, mem(rcx, rsi, 1)) - vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c21:c31 - vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm8, xmm0, xmm0) - vmovlpd(xmm0, mem(rcx, r12, 1)) // store c21:c31 - vmovhpd(xmm0, mem(rcx, r13, 1)) - add(rdi, rcx) - - vmovlpd(mem(rdx), xmm0, xmm0) // load c41:c51 - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm11, xmm0, xmm0) - vmovlpd(xmm0, mem(rdx)) // store c41:c51 - vmovhpd(xmm0, mem(rdx, rsi, 1)) - vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c61:c71 - vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm14, xmm0, xmm0) - vmovlpd(xmm0, mem(rdx, r12, 1)) // store c61:c71 - vmovhpd(xmm0, mem(rdx, r13, 1)) - add(rdi, rdx) - - - vmovlpd(mem(rcx), xmm0, xmm0) // load c02:c12 - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm6, xmm0, xmm0) - vmovlpd(xmm0, mem(rcx)) // store c02:c12 - vmovhpd(xmm0, mem(rcx, rsi, 1)) - vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c22:c32 - vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm9, xmm0, xmm0) - vmovlpd(xmm0, mem(rcx, r12, 1)) // store c22:c32 - vmovhpd(xmm0, mem(rcx, r13, 1)) - add(rdi, rcx) - - vmovlpd(mem(rdx), xmm0, xmm0) // load c42:c52 - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm12, xmm0, xmm0) - vmovlpd(xmm0, mem(rdx)) // store c42:c52 - vmovhpd(xmm0, mem(rdx, rsi, 1)) - vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c62:c72 - vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) - vaddpd(xmm15, xmm0, xmm0) - vmovlpd(xmm0, mem(rdx, r12, 1)) // store c62:c72 - vmovhpd(xmm0, mem(rdx, r13, 1)) - add(rdi, rdx) - - - - jmp(.DDONE) // jump to end. 
- - - + + // xmm4: xmm5: xmm6: + // ( ab00 ( ab01 ( ab02 + // ab10 ) ab11 ) ab12 ) + // + // xmm7: xmm8: xmm9: + // ( ab20 ( ab21 ( ab22 + // ab30 ) ab31 ) ab32 ) + // + // xmm10: xmm11: xmm12: + // ( ab40 ( ab41 ( ab42 + // ab50 ) ab51 ) ab52 ) + // + // xmm13: xmm14: xmm15: + // ( ab60 ( ab61 ( ab62 + // ab70 ) ab71 ) ab72 ) + + vfmadd231pd(mem(rcx, 0*16), xmm2, xmm4) + vfmadd231pd(mem(rcx, 1*16), xmm2, xmm7) + vfmadd231pd(mem(rcx, 2*16), xmm2, xmm10) + vfmadd231pd(mem(rcx, 3*16), xmm2, xmm13) + + vfmadd231pd(mem(r10, 0*16), xmm2, xmm5) + vfmadd231pd(mem(r10, 1*16), xmm2, xmm8) + vfmadd231pd(mem(r10, 2*16), xmm2, xmm11) + vfmadd231pd(mem(r10, 3*16), xmm2, xmm14) + + vfmadd231pd(mem(r11, 0*16), xmm2, xmm6) + vfmadd231pd(mem(r11, 1*16), xmm2, xmm9) + vfmadd231pd(mem(r11, 2*16), xmm2, xmm12) + vfmadd231pd(mem(r11, 3*16), xmm2, xmm15) + + // fall through + label(.DBETAZERO) - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.DCOLSTORBZ) // jump to column storage case - - - - label(.DGENSTORBZ) - - - vmovlpd(xmm4, mem(rcx)) - vmovhpd(xmm4, mem(rcx, rsi, 1)) - vmovlpd(xmm7, mem(rcx, r12, 1)) - vmovhpd(xmm7, mem(rcx, r13, 1)) - add(rdi, rcx) - vmovlpd(xmm10, mem(rdx)) - vmovhpd(xmm10, mem(rdx, rsi, 1)) - vmovlpd(xmm13, mem(rdx, r12, 1)) - vmovhpd(xmm13, mem(rdx, r13, 1)) - add(rdi, rdx) - - vmovlpd(xmm5, mem(rcx)) - vmovhpd(xmm5, mem(rcx, rsi, 1)) - vmovlpd(xmm8, mem(rcx, r12, 1)) - vmovhpd(xmm8, mem(rcx, r13, 1)) - add(rdi, rcx) - vmovlpd(xmm11, mem(rdx)) - vmovhpd(xmm11, mem(rdx, rsi, 1)) - vmovlpd(xmm14, mem(rdx, r12, 1)) - vmovhpd(xmm14, mem(rdx, r13, 1)) - add(rdi, rdx) - - vmovlpd(xmm6, mem(rcx)) - vmovhpd(xmm6, mem(rcx, rsi, 1)) - vmovlpd(xmm9, mem(rcx, r12, 1)) - vmovhpd(xmm9, mem(rcx, r13, 1)) - add(rdi, rcx) - vmovlpd(xmm12, mem(rdx)) - vmovhpd(xmm12, mem(rdx, rsi, 1)) - vmovlpd(xmm15, mem(rdx, r12, 1)) - vmovhpd(xmm15, mem(rdx, r13, 1)) - add(rdi, rdx) - - - - jmp(.DDONE) // jump to end. 
- - - - label(.DCOLSTORBZ) - - - vmovupd(xmm4, mem(rcx)) - vmovupd(xmm7, mem(rcx, r12, 1)) - add(rdi, rcx) - vmovupd(xmm10, mem(rdx)) - vmovupd(xmm13, mem(rdx, r12, 1)) - add(rdi, rdx) - - vmovupd(xmm5, mem(rcx)) - vmovupd(xmm8, mem(rcx, r12, 1)) - add(rdi, rcx) - vmovupd(xmm11, mem(rdx)) - vmovupd(xmm14, mem(rdx, r12, 1)) - add(rdi, rdx) - - vmovupd(xmm6, mem(rcx)) - vmovupd(xmm9, mem(rcx, r12, 1)) - add(rdi, rcx) - vmovupd(xmm12, mem(rdx)) - vmovupd(xmm15, mem(rdx, r12, 1)) - add(rdi, rdx) - - - - - + + vmovups(xmm4, mem(rcx, 0*16)) + vmovups(xmm7, mem(rcx, 1*16)) + vmovups(xmm10, mem(rcx, 2*16)) + vmovups(xmm13, mem(rcx, 3*16)) + + vmovups(xmm5, mem(r10, 0*16)) + vmovups(xmm8, mem(r10, 1*16)) + vmovups(xmm11, mem(r10, 2*16)) + vmovups(xmm14, mem(r10, 3*16)) + + vmovups(xmm6, mem(r11, 0*16)) + vmovups(xmm9, mem(r11, 1*16)) + vmovups(xmm12, mem(r11, 2*16)) + vmovups(xmm15, mem(r11, 3*16)) + label(.DDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1628,11 +973,15 @@ void bli_dgemm_piledriver_asm_8x3 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } void bli_cgemm_piledriver_asm_4x2 ( - dim_t k0, + dim_t m, + dim_t 
n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, @@ -1647,28 +996,30 @@ void bli_cgemm_piledriver_asm_4x2 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 8; - uint64_t k_left = k0 % 8; + uint64_t k_iter = k / 8; + uint64_t k_left = k % 8; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( c, 4, 2, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. mov(var(a_next), r14) // load address of a_next. - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; - + add(imm(32*4), rax) add(imm(16*4), rbx) - - + + vxorps(xmm8, xmm8, xmm8) vxorps(xmm9, xmm9, xmm9) vxorps(xmm10, xmm10, xmm10) @@ -1678,24 +1029,24 @@ void bli_cgemm_piledriver_asm_4x2 vxorps(xmm14, xmm14, xmm14) vxorps(xmm15, xmm15, xmm15) //vzeroall() - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.CLOOPKITER) // MAIN LOOP - - + + je(.CCONSIDKLEFT) // if i == 0, jump to k_left code. 
- - + + prefetch(0, mem(rbx, 256)) prefetch(0, mem(rax, 512)) - + // iteration 0 vmovaps(mem(rax, -32*4), xmm0) vbroadcastss(mem(rbx, -16*4), xmm4) @@ -1711,7 +1062,7 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, -13*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - + // iteration 1 vmovaps(mem(rax, -24*4), xmm0) vbroadcastss(mem(rbx, -12*4), xmm4) @@ -1727,10 +1078,10 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, -9*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - + prefetch(0, mem(rbx, 64+256)) prefetch(0, mem(rax, 64+512)) - + // iteration 2 vmovaps(mem(rax, -16*4), xmm0) vbroadcastss(mem(rbx, -8*4), xmm4) @@ -1746,7 +1097,7 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, -5*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - + // iteration 3 vmovaps(mem(rax, -8*4), xmm0) vbroadcastss(mem(rbx, -4*4), xmm4) @@ -1762,10 +1113,10 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, -1*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - + prefetch(0, mem(rbx, 128+256)) prefetch(0, mem(rax, 128+512)) - + // iteration 4 vmovaps(mem(rax, 0*4), xmm0) vbroadcastss(mem(rbx, 0*4), xmm4) @@ -1781,7 +1132,7 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, 3*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - + // iteration 5 vmovaps(mem(rax, 8*4), xmm0) vbroadcastss(mem(rbx, 4*4), xmm4) @@ -1797,10 +1148,10 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, 7*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - + prefetch(0, mem(rbx, 128+256)) prefetch(0, mem(rax, 128+512)) - + // iteration 6 vmovaps(mem(rax, 16*4), xmm0) vbroadcastss(mem(rbx, 8*4), xmm4) @@ -1816,7 +1167,7 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, 11*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - + // iteration 7 vmovaps(mem(rax, 24*4), xmm0) vbroadcastss(mem(rbx, 12*4), 
xmm4) @@ -1834,33 +1185,33 @@ void bli_cgemm_piledriver_asm_4x2 add(imm(8*2*8), rbx) // b += 8*2 (unroll x nr) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - - - + + + dec(rsi) // i -= 1; jmp(.CLOOPKITER) // jump to beginning of loop. - - - - - - + + + + + + label(.CCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.CLOOPKLEFT) // EDGE LOOP - - + + je(.CPOSTACCUM) // if i == 0, we're done. - + prefetch(0, mem(rbx, 256)) prefetch(0, mem(rax, 512)) - + // iteration 0 vmovaps(mem(rax, -32*4), xmm0) vbroadcastss(mem(rbx, -16*4), xmm4) @@ -1876,123 +1227,88 @@ void bli_cgemm_piledriver_asm_4x2 vbroadcastss(mem(rbx, -13*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) - - + + add(imm(1*4*8), rax) // a += 1*2 (1 x mr) add(imm(1*2*8), rbx) // b += 1*2 (1 x nr) - - + + dec(rsi) // i -= 1; jmp(.CLOOPKLEFT) // jump to beginning of loop. 
- - - + + + label(.CPOSTACCUM) - - + + prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c - - + + vpermilps(imm(0xb1), xmm9, xmm9) vpermilps(imm(0xb1), xmm11, xmm11) vpermilps(imm(0xb1), xmm13, xmm13) vpermilps(imm(0xb1), xmm15, xmm15) - + vaddsubps(xmm9, xmm8, xmm8) vaddsubps(xmm11, xmm10, xmm10) vaddsubps(xmm13, xmm12, xmm12) vaddsubps(xmm15, xmm14, xmm14) - - + + // xmm8: xmm10: // ( ab00 ( ab01 // ab10 ab11 // ab20 ab21 // ab30 ) ab31 ) - + // xmm12: xmm14: // ( ab40 ( ab41 // ab50 ab51 // ab60 ab61 // ab70 ) ab71 ) - - + + prefetch(0, mem(r14)) // prefetch a_next prefetch(0, mem(r14, 64)) // prefetch a_next - - + + // scale by alpha - + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate - + vpermilps(imm(0xb1), xmm8, xmm9) vpermilps(imm(0xb1), xmm10, xmm11) vpermilps(imm(0xb1), xmm12, xmm13) vpermilps(imm(0xb1), xmm14, xmm15) - + vmulps(xmm8, xmm0, xmm8) vmulps(xmm10, xmm0, xmm10) vmulps(xmm12, xmm0, xmm12) vmulps(xmm14, xmm0, xmm14) - + vmulps(xmm9, xmm1, xmm9) vmulps(xmm11, xmm1, xmm11) vmulps(xmm13, xmm1, xmm13) vmulps(xmm15, xmm1, xmm15) - + vaddsubps(xmm9, xmm8, xmm8) vaddsubps(xmm11, xmm10, xmm10) vaddsubps(xmm13, xmm12, xmm12) vaddsubps(xmm15, xmm14, xmm14) - - - - + + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), xmm6) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), xmm7) // load beta_i and duplicate - - - - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) - - - lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; - lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; - - - + prefetch(0, mem(r15)) // prefetch b_next prefetch(0, mem(r15, 64)) // prefetch b_next - - - - // determine if - // c % 32 == 0, AND - // 8*cs_c % 32 == 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - cmp(imm(8), rsi) // set ZF if 
(8*rs_c) == 8. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(31), rcx) // set ZF if c & 32 is zero. - setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); - test(imm(31), rdi) // set ZF if (8*cs_c) & 32 is zero. - setz(al) // al = ( ZF == 0 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm6) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -2000,175 +1316,66 @@ void bli_cgemm_piledriver_asm_4x2 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.CCOLSTORED) // jump to column storage case - - - - label(.CGENSTORED) - - - vmovlps(mem(rcx), xmm0, xmm0) // load c00:c10 - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm2, xmm2) // load c20:c30 - vmovhps(mem(rcx, r13, 1), xmm2, xmm2) - vpermilps(imm(0xb1), xmm0, xmm1) - vpermilps(imm(0xb1), xmm2, xmm3) - - vmulps(xmm6, xmm0, xmm0) - vmulps(xmm7, xmm1, xmm1) - vaddsubps(xmm1, xmm0, xmm0) - vaddps(xmm8, xmm0, xmm0) - vmovlps(xmm0, mem(rcx)) // store c00:c10 - vmovhps(xmm0, mem(rcx, rsi, 1)) - - vmulps(xmm6, xmm2, xmm2) - vmulps(xmm7, xmm3, xmm3) - vaddsubps(xmm3, xmm2, xmm2) - vaddps(xmm12, xmm2, xmm2) - vmovlps(xmm2, mem(rcx, r12, 1)) // store c20:c30 - vmovhps(xmm2, mem(rcx, r13, 1)) - - - - vmovlps(mem(r10), xmm0, xmm0) // load c01:c11 - vmovhps(mem(r10, rsi, 1), xmm0, xmm0) - vmovlps(mem(r10, r12, 1), xmm2, xmm2) // load c21:c31 - vmovhps(mem(r10, r13, 1), xmm2, xmm2) - vpermilps(imm(0xb1), xmm0, xmm1) - vpermilps(imm(0xb1), xmm2, xmm3) - - vmulps(xmm6, xmm0, xmm0) - vmulps(xmm7, xmm1, xmm1) - vaddsubps(xmm1, xmm0, xmm0) - vaddps(xmm10, xmm0, xmm0) - vmovlps(xmm0, mem(r10)) // store c01:c11 - vmovhps(xmm0, mem(r10, rsi, 1)) - - vmulps(xmm6, xmm2, 
xmm2) - vmulps(xmm7, xmm3, xmm3) - vaddsubps(xmm3, xmm2, xmm2) - vaddps(xmm14, xmm2, xmm2) - vmovlps(xmm2, mem(r10, r12, 1)) // store c21:c31 - vmovhps(xmm2, mem(r10, r13, 1)) - - - - jmp(.CDONE) // jump to end. - - - - label(.CCOLSTORED) - - - vmovups(mem(rcx), xmm0) // load c00:c10 - vmovups(mem(rcx, 16), xmm2) // load c20:c30 - vpermilps(imm(0xb1), xmm0, xmm1) - vpermilps(imm(0xb1), xmm2, xmm3) - - vmulps(xmm6, xmm0, xmm0) - vmulps(xmm7, xmm1, xmm1) - vaddsubps(xmm1, xmm0, xmm0) - vaddps(xmm8, xmm0, xmm0) - vmovups(xmm0, mem(rcx)) // store c00:c10 - - vmulps(xmm6, xmm2, xmm2) - vmulps(xmm7, xmm3, xmm3) - vaddsubps(xmm3, xmm2, xmm2) - vaddps(xmm12, xmm2, xmm2) - vmovups(xmm2, mem(rcx, 16)) // store c20:c30 - - - - vmovups(mem(r10), xmm0) // load c01:c11 - vmovups(mem(r10, 16), xmm2) // load c21:c31 - vpermilps(imm(0xb1), xmm0, xmm1) - vpermilps(imm(0xb1), xmm2, xmm3) - - vmulps(xmm6, xmm0, xmm0) - vmulps(xmm7, xmm1, xmm1) - vaddsubps(xmm1, xmm0, xmm0) - vaddps(xmm10, xmm0, xmm0) - vmovups(xmm0, mem(r10)) // store c01:c11 - - vmulps(xmm6, xmm2, xmm2) - vmulps(xmm7, xmm3, xmm3) - vaddsubps(xmm3, xmm2, xmm2) - vaddps(xmm14, xmm2, xmm2) - vmovups(xmm2, mem(r10, 16)) // store c21:c31 - - - - jmp(.CDONE) // jump to end. 
- - - + + vmovups(mem(rcx), xmm0) // load c00:c10 + vmovups(mem(rcx, 16), xmm2) // load c20:c30 + vpermilps(imm(0xb1), xmm0, xmm1) + vpermilps(imm(0xb1), xmm2, xmm3) + + vmulps(xmm6, xmm0, xmm0) + vmulps(xmm7, xmm1, xmm1) + vaddsubps(xmm1, xmm0, xmm0) + vaddps(xmm0, xmm8, xmm8) // xmm8 += beta*c00:c10 (xmm8 is what gets stored below) + + vmulps(xmm6, xmm2, xmm2) + vmulps(xmm7, xmm3, xmm3) + vaddsubps(xmm3, xmm2, xmm2) + vaddps(xmm2, xmm12, xmm12) // xmm12 += beta*c20:c30 + + vmovups(mem(r10), xmm0) // load c01:c11 + vmovups(mem(r10, 16), xmm2) // load c21:c31 + vpermilps(imm(0xb1), xmm0, xmm1) + vpermilps(imm(0xb1), xmm2, xmm3) + + vmulps(xmm6, xmm0, xmm0) + vmulps(xmm7, xmm1, xmm1) + vaddsubps(xmm1, xmm0, xmm0) + vaddps(xmm0, xmm10, xmm10) // xmm10 += beta*c01:c11 + + vmulps(xmm6, xmm2, xmm2) + vmulps(xmm7, xmm3, xmm3) + vaddsubps(xmm3, xmm2, xmm2) + vaddps(xmm2, xmm14, xmm14) // xmm14 += beta*c21:c31 + + // fall through: the store below writes xmm8/xmm12/xmm10/xmm14 + label(.CBETAZERO) - // check if aligned/column-stored - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. - jne(.CCOLSTORBZ) // jump to column storage case - - - - label(.CGENSTORBZ) - - - vmovlps(xmm8, mem(rcx)) // store c00:c10 - vmovhps(xmm8, mem(rcx, rsi, 1)) - - vmovlps(xmm12, mem(rcx, r12, 1)) // store c20:c30 - vmovhps(xmm12, mem(rcx, r13, 1)) - - vmovlps(xmm10, mem(r10)) // store c01:c11 - vmovhps(xmm10, mem(r10, rsi, 1)) - - vmovlps(xmm14, mem(r10, r12, 1)) // store c21:c31 - vmovhps(xmm14, mem(r10, r13, 1)) - - - - jmp(.CDONE) // jump to end. 
- - - - label(.CCOLSTORBZ) - - - vmovups(xmm8, mem(rcx)) // store c00:c10 - vmovups(xmm12, mem(rcx, 16)) // store c20:c30 - - vmovups(xmm10, mem(r10)) // store c01:c11 - vmovups(xmm14, mem(r10, 16)) // store c21:c31 - - - - - + + vmovups(xmm8, mem(rcx)) // store c00:c10 + vmovups(xmm12, mem(rcx, 16)) // store c20:c30 + + vmovups(xmm10, mem(r10)) // store c01:c11 + vmovups(xmm14, mem(r10, 16)) // store c21:c31 + label(.CDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2176,11 +1383,15 @@ void bli_cgemm_piledriver_asm_4x2 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( c ); } void bli_zgemm_piledriver_asm_2x2 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -2195,28 +1406,30 @@ void bli_zgemm_piledriver_asm_2x2 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_iter = k0 / 8; - uint64_t k_left = k0 % 8; + uint64_t k_iter = k / 8; + uint64_t k_left = k % 8; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( z, 2, 2, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. mov(var(a_next), r14) // load address of a_next. - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; - + add(imm(16*8), rax) add(imm(16*8), rbx) - + vxorpd(xmm8, xmm8, xmm8) vxorpd(xmm9, xmm9, xmm9) vxorpd(xmm10, xmm10, xmm10) @@ -2225,25 +1438,25 @@ void bli_zgemm_piledriver_asm_2x2 vxorpd(xmm13, xmm13, xmm13) vxorpd(xmm14, xmm14, xmm14) vxorpd(xmm15, xmm15, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.ZLOOPKITER) // MAIN LOOP - - + + je(.ZCONSIDKLEFT) // if i == 0, jump to k_left code. 
- - + + prefetch(0, mem(rbx, 256)) - + prefetch(0, mem(rax, 512)) - + // iteration 0 vmovaps(mem(rax, -16*8), xmm0) vmovddup(mem(rbx, -16*8), xmm4) @@ -2261,7 +1474,7 @@ void bli_zgemm_piledriver_asm_2x2 vmovaps(mem(rax, -12*8), xmm0) vmovddup(mem(rbx, -12*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) - + // iteration 1 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -10*8), xmm1) @@ -2277,11 +1490,11 @@ void bli_zgemm_piledriver_asm_2x2 vmovaps(mem(rax, -8*8), xmm0) vmovddup(mem(rbx, -8*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) - + prefetch(0, mem(rbx, 64+256)) - + prefetch(0, mem(rax, 64+512)) - + // iteration 2 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -6*8), xmm1) @@ -2297,7 +1510,7 @@ void bli_zgemm_piledriver_asm_2x2 vmovaps(mem(rax, -4*8), xmm0) vmovddup(mem(rbx, -4*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) - + // iteration 3 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -2*8), xmm1) @@ -2313,11 +1526,11 @@ void bli_zgemm_piledriver_asm_2x2 vmovaps(mem(rax, 0*8), xmm0) vmovddup(mem(rbx, 0*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) - + prefetch(0, mem(rbx, 128+256)) - + prefetch(0, mem(rax, 128+512)) - + // iteration 4 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 2*8), xmm1) @@ -2333,7 +1546,7 @@ void bli_zgemm_piledriver_asm_2x2 vmovaps(mem(rax, 4*8), xmm0) vmovddup(mem(rbx, 4*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) - + // iteration 5 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 6*8), xmm1) @@ -2349,11 +1562,11 @@ void bli_zgemm_piledriver_asm_2x2 vmovaps(mem(rax, 8*8), xmm0) vmovddup(mem(rbx, 8*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) - + prefetch(0, mem(rbx, 128+256)) - + prefetch(0, mem(rax, 128+512)) - + // iteration 6 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 10*8), xmm1) @@ -2369,7 +1582,7 @@ void bli_zgemm_piledriver_asm_2x2 vmovaps(mem(rax, 12*8), xmm0) vmovddup(mem(rbx, 12*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) - + // iteration 7 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 14*8), xmm1) @@ -2385,34 +1598,34 @@ void 
bli_zgemm_piledriver_asm_2x2 add(imm(8*2*16), rbx) // b += 8*2 (unroll x nr) vfmadd231pd(xmm0, xmm7, xmm11) vfmadd231pd(xmm1, xmm7, xmm15) - - - + + + dec(rsi) // i -= 1; jmp(.ZLOOPKITER) // jump to beginning of loop. - - - - - - + + + + + + label(.ZCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.ZLOOPKLEFT) // EDGE LOOP - - + + je(.ZPOSTACCUM) // if i == 0, we're done. - + prefetch(0, mem(rbx, 256)) - + prefetch(0, mem(rax, 512)) - + // iteration 0 vmovaps(mem(rax, -16*8), xmm0) vmovddup(mem(rbx, -16*8), xmm4) @@ -2428,119 +1641,86 @@ void bli_zgemm_piledriver_asm_2x2 vmovddup(mem(rbx, -13*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vfmadd231pd(xmm1, xmm7, xmm15) - - + + add(imm(1*2*16), rax) // a += 1*2 (1 x mr) add(imm(1*2*16), rbx) // b += 1*2 (1 x nr) - - + + dec(rsi) // i -= 1; jmp(.ZLOOPKLEFT) // jump to beginning of loop. - - - + + + label(.ZPOSTACCUM) - - + + prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c - - + + vpermilpd(imm(0x1), xmm9, xmm9) vpermilpd(imm(0x1), xmm11, xmm11) vpermilpd(imm(0x1), xmm13, xmm13) vpermilpd(imm(0x1), xmm15, xmm15) - + vaddsubpd(xmm9, xmm8, xmm8) vaddsubpd(xmm11, xmm10, xmm10) vaddsubpd(xmm13, xmm12, xmm12) vaddsubpd(xmm15, xmm14, xmm14) - - + + // xmm8: xmm10: // ( ab00 ( ab01 // ab10 ) ab11 ) - + // xmm12: xmm14: // ( ab20 ( ab21 // ab30 ) ab31 ) - - + + prefetch(0, mem(r14)) // prefetch a_next prefetch(0, mem(r14, 64)) // prefetch a_next - - + + // scale by alpha - + mov(var(alpha), rax) // load address of alpha vmovddup(mem(rax), xmm0) // load alpha_r and duplicate vmovddup(mem(rax, 8), xmm1) // load alpha_i and duplicate - + vpermilpd(imm(0x1), xmm8, xmm9) vpermilpd(imm(0x1), xmm10, xmm11) vpermilpd(imm(0x1), xmm12, xmm13) vpermilpd(imm(0x1), xmm14, xmm15) - + vmulpd(xmm8, xmm0, xmm8) vmulpd(xmm10, xmm0, xmm10) 
vmulpd(xmm12, xmm0, xmm12) vmulpd(xmm14, xmm0, xmm14) - + vmulpd(xmm9, xmm1, xmm9) vmulpd(xmm11, xmm1, xmm11) vmulpd(xmm13, xmm1, xmm13) vmulpd(xmm15, xmm1, xmm15) - + vaddsubpd(xmm9, xmm8, xmm8) vaddsubpd(xmm11, xmm10, xmm10) vaddsubpd(xmm13, xmm12, xmm12) vaddsubpd(xmm15, xmm14, xmm14) - - - - + + + + mov(var(beta), rbx) // load address of beta vmovddup(mem(rbx), xmm6) // load beta_r and duplicate vmovddup(mem(rbx, 8), xmm7) // load beta_i and duplicate - - - - - - - - mov(var(rs_c), rsi) // load rs_c - lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) - lea(mem(, rsi, 2), rsi) - //lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; - - - - - + prefetch(0, mem(r15)) // prefetch b_next prefetch(0, mem(r15, 64)) // prefetch b_next - - - - // determine if - // c % 32 == 0, AND - // 16*cs_c % 32 == 0, AND - // rs_c == 1 - // ie: aligned, ldim aligned, and - // column-stored - - cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16. - sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); - test(imm(31), rcx) // set ZF if c & 32 is zero. - setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); - test(imm(31), rdi) // set ZF if (16*cs_c) & 32 is zero. - setz(al) // al = ( ZF == 0 ? 1 : 0 ); - // and(bl,bh) followed by - // and(bh,al) will reveal result - + // now avoid loading C if beta == 0 - + vxorpd(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomisd(xmm0, xmm6) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -2548,161 +1728,66 @@ void bli_zgemm_piledriver_asm_2x2 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case - - - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. 
- jne(.ZCOLSTORED) // jump to column storage case - - - - label(.ZGENSTORED) - - - vmovups(mem(rcx), xmm0) // load c00 - vmovups(mem(rcx, rsi, 1), xmm2) // load c10 - vpermilpd(imm(0x1), xmm0, xmm1) - vpermilpd(imm(0x1), xmm2, xmm3) - - vmulpd(xmm6, xmm0, xmm0) - vmulpd(xmm7, xmm1, xmm1) - vaddsubpd(xmm1, xmm0, xmm0) - vaddpd(xmm8, xmm0, xmm0) - vmovups(xmm0, mem(rcx)) // store c00 - - vmulpd(xmm6, xmm2, xmm2) - vmulpd(xmm7, xmm3, xmm3) - vaddsubpd(xmm3, xmm2, xmm2) - vaddpd(xmm12, xmm2, xmm2) - vmovups(xmm2, mem(rcx, rsi, 1)) // store c10 - - - - vmovups(mem(r10), xmm0) // load c01 - vmovups(mem(r10, rsi, 1), xmm2) // load c11 - vpermilpd(imm(0x1), xmm0, xmm1) - vpermilpd(imm(0x1), xmm2, xmm3) - - vmulpd(xmm6, xmm0, xmm0) - vmulpd(xmm7, xmm1, xmm1) - vaddsubpd(xmm1, xmm0, xmm0) - vaddpd(xmm10, xmm0, xmm0) - vmovups(xmm0, mem(r10)) // store c01 - - vmulpd(xmm6, xmm2, xmm2) - vmulpd(xmm7, xmm3, xmm3) - vaddsubpd(xmm3, xmm2, xmm2) - vaddpd(xmm14, xmm2, xmm2) - vmovups(xmm2, mem(r10, rsi, 1)) // store c11 - - - - jmp(.ZDONE) // jump to end. 
- - - - label(.ZCOLSTORED) - - - vmovups(mem(rcx), xmm0) // load c00 - vmovups(mem(rcx, 16), xmm2) // load c10 - vpermilpd(imm(0x1), xmm0, xmm1) - vpermilpd(imm(0x1), xmm2, xmm3) - - vmulpd(xmm6, xmm0, xmm0) - vmulpd(xmm7, xmm1, xmm1) - vaddsubpd(xmm1, xmm0, xmm0) - vaddpd(xmm8, xmm0, xmm0) - vmovups(xmm0, mem(rcx)) // store c00 - - vmulpd(xmm6, xmm2, xmm2) - vmulpd(xmm7, xmm3, xmm3) - vaddsubpd(xmm3, xmm2, xmm2) - vaddpd(xmm12, xmm2, xmm2) - vmovups(xmm2, mem(rcx, 16)) // store c10 - - - - vmovups(mem(r10), xmm0) // load c01 - vmovups(mem(r10, 16), xmm2) // load c11 - vpermilpd(imm(0x1), xmm0, xmm1) - vpermilpd(imm(0x1), xmm2, xmm3) - - vmulpd(xmm6, xmm0, xmm0) - vmulpd(xmm7, xmm1, xmm1) - vaddsubpd(xmm1, xmm0, xmm0) - vaddpd(xmm10, xmm0, xmm0) - vmovups(xmm0, mem(r10)) // store c01 - - vmulpd(xmm6, xmm2, xmm2) - vmulpd(xmm7, xmm3, xmm3) - vaddsubpd(xmm3, xmm2, xmm2) - vaddpd(xmm14, xmm2, xmm2) - vmovups(xmm2, mem(r10, 16)) // store c11 - - - - jmp(.ZDONE) // jump to end. - - - + + vmovups(mem(rcx), xmm0) // load c00 + vmovups(mem(rcx, 16), xmm2) // load c10 + vpermilpd(imm(0x1), xmm0, xmm1) + vpermilpd(imm(0x1), xmm2, xmm3) + + vmulpd(xmm6, xmm0, xmm0) + vmulpd(xmm7, xmm1, xmm1) + vaddsubpd(xmm1, xmm0, xmm0) + vaddpd(xmm0, xmm8, xmm8) // xmm8 += beta*c00 (xmm8 is what gets stored below) + + vmulpd(xmm6, xmm2, xmm2) + vmulpd(xmm7, xmm3, xmm3) + vaddsubpd(xmm3, xmm2, xmm2) + vaddpd(xmm2, xmm12, xmm12) // xmm12 += beta*c10 + + vmovups(mem(r10), xmm0) // load c01 + vmovups(mem(r10, 16), xmm2) // load c11 + vpermilpd(imm(0x1), xmm0, xmm1) + vpermilpd(imm(0x1), xmm2, xmm3) + + vmulpd(xmm6, xmm0, xmm0) + vmulpd(xmm7, xmm1, xmm1) + vaddsubpd(xmm1, xmm0, xmm0) + vaddpd(xmm0, xmm10, xmm10) // xmm10 += beta*c01 + + vmulpd(xmm6, xmm2, xmm2) + vmulpd(xmm7, xmm3, xmm3) + vaddsubpd(xmm3, xmm2, xmm2) + vaddpd(xmm2, xmm14, xmm14) // xmm14 += beta*c11 + + // fall through: the store below writes xmm8/xmm12/xmm10/xmm14 + label(.ZBETAZERO) - // check if aligned/column-stored - // check if aligned/column-stored - and(bl, bh) // set ZF if bl & bh == 1. - and(bh, al) // set ZF if bh & al == 1. 
- jne(.ZCOLSTORBZ) // jump to column storage case - - - - label(.ZGENSTORBZ) - - - vmovups(xmm8, mem(rcx)) // store c00 - vmovups(xmm12, mem(rcx, rsi, 1)) // store c10 - - vmovups(xmm10, mem(r10)) // store c01 - vmovups(xmm14, mem(r10, rsi, 1)) // store c11 - - - - jmp(.ZDONE) // jump to end. - - - - label(.ZCOLSTORBZ) - - - vmovups(xmm8, mem(rcx)) // store c00 - vmovups(xmm12, mem(rcx, 16)) // store c10 - - vmovups(xmm10, mem(r10)) // store c01 - vmovups(xmm14, mem(r10, 16)) // store c11 - - - - - + + vmovups(xmm8, mem(rcx)) // store c00 + vmovups(xmm12, mem(rcx, 16)) // store c10 + + vmovups(xmm10, mem(r10)) // store c01 + vmovups(xmm14, mem(r10, 16)) // store c11 + label(.ZDONE) - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next) // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next) // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2710,6 +1795,8 @@ void bli_zgemm_piledriver_asm_2x2 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c index 396824986..84e7d16d3 100644 --- a/kernels/power10/3/bli_dgemm_power10_mma.c +++ b/kernels/power10/3/bli_dgemm_power10_mma.c @@ -37,7 +37,7 @@ #define D_ASSEMBLE_VEC_PAIR \ 
__builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \ - __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); + __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); #define D_ACCUMULATE \ __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \ @@ -47,7 +47,7 @@ __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \ __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \ __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \ - __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); + __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); #define D_INCREMENT \ A0+=8; \ @@ -57,17 +57,19 @@ LOAD_VECTORS \ D_ASSEMBLE_VEC_PAIR \ D_INCREMENT \ - D_ACCUMULATE + D_ACCUMULATE void bli_dgemm_power10_mma_8x8 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, + double* restrict c, inc_t rs_c0, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) @@ -76,11 +78,13 @@ void bli_dgemm_power10_mma_8x8 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out) - uint64_t k_iter = (k0-1) / 4; - uint64_t k_left = (k0-1) % 4; + uint64_t k_iter = (k-1) / 4; + uint64_t k_left = (k-1) % 4; uint64_t rs_c = rs_c0; + GEMM_UKR_SETUP_CT( d, 8, 8, true ); + double* restrict A0 = a; double* restrict B0 = b; double* restrict C0 = c; @@ -92,23 +96,23 @@ void bli_dgemm_power10_mma_8x8 dv4sf_t *rowC; /* 8 accumulator registers that will be used to store the result. - + Each accumulator register is mapped to 4 vector registers. Illustration: - + acc0 = [ vs0 vs1 vs3 vs4 ] - These registers are used to store the result of an outer product + These registers are used to store the result of an outer product instruction (general outer product instruction syntax: xv???ger??). 
*/ - __vector_quad acc0, acc1, acc2, acc3, + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - /* 2 vector pairs are necessary for a double precision outer product + /* 2 vector pairs are necessary for a double precision outer product instruction. */ - __vector_pair colA_1, + __vector_pair colA_1, colA_2; /* Prefetch C so that it stays in cache */ @@ -123,17 +127,17 @@ void bli_dgemm_power10_mma_8x8 /* Load elements into vector registers */ vec_t *ca = (vec_t *) A0; - vec_t *rb = (vec_t *) B0; + vec_t *rb = (vec_t *) B0; - /* Each accumulator represents a matrix of size + /* Each accumulator represents a matrix of size 4 x ( 16 / (datatype size in bytes) ) (vector register size = 16B) - Thus in the case of double, the accumulate registers represent a 4x2 + Thus in the case of double, the accumulate registers represent a 4x2 matrix. However, a vector register can hold at most 2 doubles. Thus, if - we performed an outer product using 2 vector register, we can only get a + we performed an outer product using 2 vector register, we can only get a 2x2 matrix. Therefore, we must create a vector register pair in order to get the desired 4x2 matrix. 
- + */ D_ASSEMBLE_VEC_PAIR @@ -158,7 +162,7 @@ void bli_dgemm_power10_mma_8x8 D_AB_PRODUCT D_AB_PRODUCT } - + // edge loop for (int k = 0; k 0; kk--) { + vector double va00 = vec_splats( *(double *)( pa+0 ) ); + vector double va10 = vec_splats( *(double *)( pa+d1 ) ); + vector double va20 = vec_splats( *(double *)( pa+d2 ) ); + vector double va30 = vec_splats( *(double *)( pa+d3 ) ); + vector double va40 = vec_splats( *(double *)( pa+d4 ) ); + vector double va50 = vec_splats( *(double *)( pa+d5 ) ); + vector double va60 = vec_splats( *(double *)( pa+d6 ) ); + vector double va70 = vec_splats( *(double *)( pa+d7 ) ); + pa += 8*sizeof(double); + + vector double vb00_01 = *(vector double *)( pb+0 ); + vector double vb02_03 = *(vector double *)( pb+d2 ); + pb += 4*sizeof(double); + + vc00_01 = vec_madd(va00, vb00_01, vc00_01); + vc02_03 = vec_madd(va00, vb02_03, vc02_03); + vc10_11 = vec_madd(va10, vb00_01, vc10_11); + vc12_13 = vec_madd(va10, vb02_03, vc12_13); + vc20_21 = vec_madd(va20, vb00_01, vc20_21); + vc22_23 = vec_madd(va20, vb02_03, vc22_23); + vc30_31 = vec_madd(va30, vb00_01, vc30_31); + vc32_33 = vec_madd(va30, vb02_03, vc32_33); + vc40_41 = vec_madd(va40, vb00_01, vc40_41); + vc42_43 = vec_madd(va40, vb02_03, vc42_43); + vc50_51 = vec_madd(va50, vb00_01, vc50_51); + vc52_53 = vec_madd(va50, vb02_03, vc52_53); + vc60_61 = vec_madd(va60, vb00_01, vc60_61); + vc62_63 = vec_madd(va60, vb02_03, vc62_63); + vc70_71 = vec_madd(va70, vb00_01, vc70_71); + vc72_73 = vec_madd(va70, vb02_03, vc72_73); + } + + vector double valpha = vec_splats( *alpha ); + vector double vbeta = (vector double) { *beta, *beta }; + + vector double *pc = (vector double *)c; + + vc00_01 = vec_mul(valpha, vc00_01); + vc02_03 = vec_mul(valpha, vc02_03); + pc[0] = vec_madd( pc[0], vbeta, vc00_01); + pc[1] = vec_madd( pc[1], vbeta, vc02_03); + pc += rs_c/2; + + vc10_11 = vec_mul(valpha, vc10_11); + vc12_13 = vec_mul(valpha, vc12_13); + pc[0] = vec_madd( pc[0], vbeta, vc10_11); + pc[1] = 
vec_madd( pc[1], vbeta, vc12_13); + pc += rs_c/2; + + vc20_21 = vec_mul(valpha, vc20_21); + vc22_23 = vec_mul(valpha, vc22_23); + pc[0] = vec_madd( pc[0], vbeta, vc20_21); + pc[1] = vec_madd( pc[1], vbeta, vc22_23); + pc += rs_c/2; + + vc30_31 = vec_mul(valpha, vc30_31); + vc32_33 = vec_mul(valpha, vc32_33); + pc[0] = vec_madd( pc[0], vbeta, vc30_31); + pc[1] = vec_madd( pc[1], vbeta, vc32_33); + pc += rs_c/2; + + vc40_41 = vec_mul(valpha, vc40_41); + vc42_43 = vec_mul(valpha, vc42_43); + pc[0] = vec_madd( pc[0], vbeta, vc40_41); + pc[1] = vec_madd( pc[1], vbeta, vc42_43); + pc += rs_c/2; + + vc50_51 = vec_mul(valpha, vc50_51); + vc52_53 = vec_mul(valpha, vc52_53); + pc[0] = vec_madd( pc[0], vbeta, vc50_51); + pc[1] = vec_madd( pc[1], vbeta, vc52_53); + pc += rs_c/2; + + vc60_61 = vec_mul(valpha, vc60_61); + vc62_63 = vec_mul(valpha, vc62_63); + pc[0] = vec_madd( pc[0], vbeta, vc60_61); + pc[1] = vec_madd( pc[1], vbeta, vc62_63); + pc += rs_c/2; + + vc70_71 = vec_mul(valpha, vc70_71); + vc72_73 = vec_mul(valpha, vc72_73); + pc[0] = vec_madd( pc[0], vbeta, vc70_71); + pc[1] = vec_madd( pc[1], vbeta, vc72_73); + pc += rs_c/2; + } + else + { + GEMM_UKR_SETUP_CT( d, 8, 4, false ); + // Optimized code for case where C columns are contiguous (column-major C) vector double vzero = vec_splats( 0.0 ); @@ -301,168 +433,8 @@ void bli_dgemm_power7_int_8x4 pc[1] = vec_madd( pc[1], vbeta, vc23_33); pc[2] = vec_madd( pc[2], vbeta, vc43_53); pc[3] = vec_madd( pc[3], vbeta, vc63_73); - } - else -#endif -#if 1 - if ( cs_c == 1 ) { - // Optimized code for case where C rows are contiguous (i.e. 
C is row-major) - - vector double vzero = vec_splats( 0.0 ); - - vector double vc00_01 = vzero; - vector double vc02_03 = vzero; - vector double vc10_11 = vzero; - vector double vc12_13 = vzero; - vector double vc20_21 = vzero; - vector double vc22_23 = vzero; - vector double vc30_31 = vzero; - vector double vc32_33 = vzero; - vector double vc40_41 = vzero; - vector double vc42_43 = vzero; - vector double vc50_51 = vzero; - vector double vc52_53 = vzero; - vector double vc60_61 = vzero; - vector double vc62_63 = vzero; - vector double vc70_71 = vzero; - vector double vc72_73 = vzero; - - unsigned long long pa = (unsigned long long)a; - unsigned long long pb = (unsigned long long)b; - -#if 0 - unsigned long long d1 = 1*sizeof(double); - unsigned long long d2 = 2*sizeof(double); - unsigned long long d3 = 3*sizeof(double); - unsigned long long d4 = 4*sizeof(double); - unsigned long long d6 = 6*sizeof(double); -#else - // ppc64 linux abi: r14-r31 Nonvolatile registers used for local variables - register unsigned long long d1 __asm ("r21") = 1*sizeof(double); - register unsigned long long d2 __asm ("r22") = 2*sizeof(double); - register unsigned long long d3 __asm ("r23") = 3*sizeof(double); - register unsigned long long d4 __asm ("r24") = 4*sizeof(double); - register unsigned long long d5 __asm ("r25") = 5*sizeof(double); - register unsigned long long d6 __asm ("r26") = 6*sizeof(double); - register unsigned long long d7 __asm ("r27") = 7*sizeof(double); - - __asm__ volatile (";" : "=r" (d1) : "r" (d1) ); - __asm__ volatile (";" : "=r" (d2) : "r" (d2) ); - __asm__ volatile (";" : "=r" (d3) : "r" (d3) ); - __asm__ volatile (";" : "=r" (d4) : "r" (d4) ); - __asm__ volatile (";" : "=r" (d5) : "r" (d5) ); - __asm__ volatile (";" : "=r" (d6) : "r" (d6) ); - __asm__ volatile (";" : "=r" (d7) : "r" (d7) ); -#endif - - int kk; - for (kk=k; kk > 0; kk--) { - vector double va00 = vec_splats( *(double *)( pa+0 ) ); - vector double va10 = vec_splats( *(double *)( pa+d1 ) ); - vector 
double va20 = vec_splats( *(double *)( pa+d2 ) ); - vector double va30 = vec_splats( *(double *)( pa+d3 ) ); - vector double va40 = vec_splats( *(double *)( pa+d4 ) ); - vector double va50 = vec_splats( *(double *)( pa+d5 ) ); - vector double va60 = vec_splats( *(double *)( pa+d6 ) ); - vector double va70 = vec_splats( *(double *)( pa+d7 ) ); - pa += 8*sizeof(double); - - vector double vb00_01 = *(vector double *)( pb+0 ); - vector double vb02_03 = *(vector double *)( pb+d2 ); - pb += 4*sizeof(double); - - vc00_01 = vec_madd(va00, vb00_01, vc00_01); - vc02_03 = vec_madd(va00, vb02_03, vc02_03); - vc10_11 = vec_madd(va10, vb00_01, vc10_11); - vc12_13 = vec_madd(va10, vb02_03, vc12_13); - vc20_21 = vec_madd(va20, vb00_01, vc20_21); - vc22_23 = vec_madd(va20, vb02_03, vc22_23); - vc30_31 = vec_madd(va30, vb00_01, vc30_31); - vc32_33 = vec_madd(va30, vb02_03, vc32_33); - vc40_41 = vec_madd(va40, vb00_01, vc40_41); - vc42_43 = vec_madd(va40, vb02_03, vc42_43); - vc50_51 = vec_madd(va50, vb00_01, vc50_51); - vc52_53 = vec_madd(va50, vb02_03, vc52_53); - vc60_61 = vec_madd(va60, vb00_01, vc60_61); - vc62_63 = vec_madd(va60, vb02_03, vc62_63); - vc70_71 = vec_madd(va70, vb00_01, vc70_71); - vc72_73 = vec_madd(va70, vb02_03, vc72_73); - } - - vector double valpha = vec_splats( *alpha ); - vector double vbeta = (vector double) { *beta, *beta }; - - vector double *pc = (vector double *)c; - - vc00_01 = vec_mul(valpha, vc00_01); - vc02_03 = vec_mul(valpha, vc02_03); - pc[0] = vec_madd( pc[0], vbeta, vc00_01); - pc[1] = vec_madd( pc[1], vbeta, vc02_03); - pc += rs_c/2; - - vc10_11 = vec_mul(valpha, vc10_11); - vc12_13 = vec_mul(valpha, vc12_13); - pc[0] = vec_madd( pc[0], vbeta, vc10_11); - pc[1] = vec_madd( pc[1], vbeta, vc12_13); - pc += rs_c/2; - - vc20_21 = vec_mul(valpha, vc20_21); - vc22_23 = vec_mul(valpha, vc22_23); - pc[0] = vec_madd( pc[0], vbeta, vc20_21); - pc[1] = vec_madd( pc[1], vbeta, vc22_23); - pc += rs_c/2; - - vc30_31 = vec_mul(valpha, vc30_31); - vc32_33 = 
vec_mul(valpha, vc32_33); - pc[0] = vec_madd( pc[0], vbeta, vc30_31); - pc[1] = vec_madd( pc[1], vbeta, vc32_33); - pc += rs_c/2; - - vc40_41 = vec_mul(valpha, vc40_41); - vc42_43 = vec_mul(valpha, vc42_43); - pc[0] = vec_madd( pc[0], vbeta, vc40_41); - pc[1] = vec_madd( pc[1], vbeta, vc42_43); - pc += rs_c/2; - - vc50_51 = vec_mul(valpha, vc50_51); - vc52_53 = vec_mul(valpha, vc52_53); - pc[0] = vec_madd( pc[0], vbeta, vc50_51); - pc[1] = vec_madd( pc[1], vbeta, vc52_53); - pc += rs_c/2; - - vc60_61 = vec_mul(valpha, vc60_61); - vc62_63 = vec_mul(valpha, vc62_63); - pc[0] = vec_madd( pc[0], vbeta, vc60_61); - pc[1] = vec_madd( pc[1], vbeta, vc62_63); - pc += rs_c/2; - - vc70_71 = vec_mul(valpha, vc70_71); - vc72_73 = vec_mul(valpha, vc72_73); - pc[0] = vec_madd( pc[0], vbeta, vc70_71); - pc[1] = vec_madd( pc[1], vbeta, vc72_73); - pc += rs_c/2; - } - else -#endif - { /* General case. Just do it right. */ -#if 1 || defined(UTEST) - const long MR = BLIS_DEFAULT_MR_D, NR = BLIS_DEFAULT_NR_D; - const long LDA = MR, LDB = NR; - int i, j, kk; - double c00; - - for (i=0; i < MR; i++) { - for (j=0; j < NR; j++) { - c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta; - for (kk=0; kk < k; kk++) - c00 += *alpha * (a[COLMAJ_INDEX(i,kk,LDA)] * b[ROWMAJ_INDEX(kk,j,LDB)]); - c[BLIS_INDEX(i,j,rs_c,cs_c)] = c00; - } - } -#else - //BLIS_DGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); -#endif + GEMM_UKR_FLUSH_CT( d ); } } @@ -477,30 +449,26 @@ void bli_dgemm_power7_int_8x4 */ void bli_cgemm_power7_int_8x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, - scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. 
- uint64_t k = k0; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - #if 1 || defined(UTEST) const long MR = BLIS_DEFAULT_MR_C, NR = BLIS_DEFAULT_NR_C; const long LDA = MR, LDB = NR; int i, j, kk; scomplex c00; - for (i=0; i < MR; i++) { - for (j=0; j < NR; j++) { + for (i=0; i < m; i++) { + for (j=0; j < n; j++) { scomplex tmpc, tmpa, tmpb, tmp; //c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta; tmpc = c[BLIS_INDEX(i,j,rs_c,cs_c)]; @@ -534,30 +502,26 @@ void bli_cgemm_power7_int_8x4 */ void bli_zgemm_power7_int_8x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, - scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { - // Typecast local copies of integers in case dim_t and inc_t are a - // different size than is expected by load instructions. - uint64_t k = k0; - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - #if 1 || defined(UTEST) const long MR = BLIS_DEFAULT_MR_Z, NR = BLIS_DEFAULT_NR_Z; const long LDA = MR, LDB = NR; int i, j, kk; dcomplex c00; - for (i=0; i < MR; i++) { - for (j=0; j < NR; j++) { + for (i=0; i < m; i++) { + for (j=0; j < n; j++) { dcomplex tmpc, tmpa, tmpb, tmp; //c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta; tmpc = c[BLIS_INDEX(i,j,rs_c,cs_c)]; diff --git a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h index ef1930907..50984a67d 100644 --- a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h +++ b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h @@ -43,6 +43,8 @@ void bli_sgemm_opt_8x4 ( + dim_t m, + dim_t n, dim_t k, float* restrict alpha, float* restrict a, @@ -55,6 +57,8 @@ void bli_sgemm_opt_8x4 void bli_dgemm_opt_8x4 ( + dim_t m, + dim_t n, dim_t k, double* restrict alpha, double* restrict a, @@ -67,6 +71,8 @@ void bli_dgemm_opt_8x4 void bli_cgemm_opt_8x4 ( + dim_t m, + dim_t n, dim_t k, scomplex* restrict 
alpha, scomplex* restrict a, @@ -79,6 +85,8 @@ void bli_cgemm_opt_8x4 void bli_zgemm_opt_8x4 ( + dim_t m, + dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c index ec09f8e38..3e5f0d416 100644 --- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c +++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c @@ -37,7 +37,9 @@ void bli_dgemm_power9_asm_12x6 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -50,117 +52,91 @@ void bli_dgemm_power9_asm_12x6 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 16; - uint64_t k_left = k0 % 16; + uint64_t k_iter = k / 16; + uint64_t k_left = k % 16; - uint64_t rs_c = rs_c0; + uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( d, 12, 6, false ); + __asm__ volatile - ( - " \n\t" - "ld %%r7, %2 \n\t" // load ptr of A - "ld %%r8, %3 \n\t" // load ptr of B - "ld %%r16, %6 \n\t" // load ptr of C - " \n\t" - "ld %%r28, %4 \n\t" // load ptr for alpha - "ld %%r29, %5 \n\t" // load ptr for beta - " \n\t" - "ld %%r11, %0 \n\t" // load k_iter - "ld %%r12, %1 \n\t" // load k_left - " \n\t" - "ld %%r10, %8 \n\t" // load cs_c - "slwi %%r10, %%r10, 3 \n\t" // mul by size of elem - " \n\t" - "ld %%r9, %7 \n\t" // load rs_c - "slwi %%r9, %%r9, 3 \n\t" // mul by size of elem - " \n\t" - "ld %%r26, 0(%%r29) \n\t" // load val of beta - " \n\t" - "lxvdsx %%vs62, 0, %%r28 \n\t" // splat alpha - "lxvdsx %%vs63, 0, %%r29 \n\t" // splat beta - " \n\t" - "add %%r17, %%r16, %%r10 \n\t" // addr of col 1 of C - "add %%r18, %%r17, %%r10 \n\t" // col 2 of C - "add %%r19, %%r18, %%r10 \n\t" // col 3 of C - "add %%r20, %%r19, %%r10 \n\t" // col 4 of C - "add %%r21, %%r20, %%r10 \n\t" // col 5 of C - " \n\t" - DZERO_OUT_VREG - " \n\t" - DPRELOAD - " \n\t" - "addi %%r8, %%r8, 96 
\n\t" // move to next col/row of A/B - "addi %%r7, %%r7, 96 \n\t" - " \n\t" - DPREFETCH - " \n\t" - "cmpwi %%r11, 0 \n\t" // if k_iter == 0, - "beq DCONSIDERKLEFT \n\t" // then jmp to k_left - "mtctr %%r11 \n\t" // else, do k_iter loop - " \n\t" - "DLOOPKITER: \n\t" // k_iter loop - " \n\t" - A_B_PRODUCT_16 // compute A*B - " \n\t" - "bdnz DLOOPKITER \n\t" - " \n\t" - "DCONSIDERKLEFT: \n\t" - " \n\t" - "cmpwi %%r12, 0 \n\t" // if k_left == 0, - "beq DPOSTACCUM \n\t" // then jmp to post accum - "mtctr %%r12 \n\t" // else, do k_left loop - " \n\t" - "DLOOPKLEFT: \n\t" // k_left loop - " \n\t" - A_B_PRODUCT_1 - " \n\t" - "bdnz DLOOPKLEFT \n\t" - " \n\t" - "DPOSTACCUM: \n\t" - " \n\t" - DSCALE_ALPHA - " \n\t" - "cmpdi %%r26, 0 \n\t" // if beta == 0, - "beq DBETAZERO \n\t" // then jmp to BZ - " \n\t" - "cmpwi %%r9, 8 \n\t" // if rs_c == 8 - "beq DCOLSTOREDBNZ \n\t" // then jmp to col store - " \n\t" - "DGENSTOREDBNZ: \n\t" // BNZ gen stored case - " \n\t" - DGEN_LOAD_OFS_C - " \n\t" - DGEN_SCALE_BETA - " \n\t" - "b DGENSTORED \n\t" - " \n\t" - "DCOLSTOREDBNZ: \n\t" // BNZ col stored case - " \n\t" - DCOL_SCALE_BETA - " \n\t" - "b DCOLSTORED \n\t" - " \n\t" - "DBETAZERO: \n\t" // BZ case - " \n\t" - "cmpwi %%r9, 8 \n\t" // if rs_c == 8, - "beq DCOLSTORED \n\t" // C is col stored - " \n\t" - "DGENSTORED: \n\t" // BZ gen stored case - " \n\t" - DGEN_LOAD_OFS_C - " \n\t" - DGEN_STORE - " \n\t" - "b DDONE \n\t" - " \n\t" - "DCOLSTORED: \n\t" // BZ col stored case - " \n\t" - DCOL_STORE - " \n\t" - "DDONE: \n\t" - " \n\t" - : // output operands (none) + ( + " \n\t" + "ld %%r7, %2 \n\t" // load ptr of A + "ld %%r8, %3 \n\t" // load ptr of B + "ld %%r16, %6 \n\t" // load ptr of C + " \n\t" + "ld %%r28, %4 \n\t" // load ptr for alpha + "ld %%r29, %5 \n\t" // load ptr for beta + " \n\t" + "ld %%r11, %0 \n\t" // load k_iter + "ld %%r12, %1 \n\t" // load k_left + " \n\t" + "ld %%r10, %8 \n\t" // load cs_c + "slwi %%r10, %%r10, 3 \n\t" // mul by size of elem + " \n\t" + "ld %%r9, %7 
\n\t" // load rs_c + "slwi %%r9, %%r9, 3 \n\t" // mul by size of elem + " \n\t" + "ld %%r26, 0(%%r29) \n\t" // load val of beta + " \n\t" + "lxvdsx %%vs62, 0, %%r28 \n\t" // splat alpha + "lxvdsx %%vs63, 0, %%r29 \n\t" // splat beta + " \n\t" + "add %%r17, %%r16, %%r10 \n\t" // addr of col 1 of C + "add %%r18, %%r17, %%r10 \n\t" // col 2 of C + "add %%r19, %%r18, %%r10 \n\t" // col 3 of C + "add %%r20, %%r19, %%r10 \n\t" // col 4 of C + "add %%r21, %%r20, %%r10 \n\t" // col 5 of C + " \n\t" + DZERO_OUT_VREG + " \n\t" + DPRELOAD + " \n\t" + "addi %%r8, %%r8, 96 \n\t" // move to next col/row of A/B + "addi %%r7, %%r7, 96 \n\t" + " \n\t" + DPREFETCH + " \n\t" + "cmpwi %%r11, 0 \n\t" // if k_iter == 0, + "beq DCONSIDERKLEFT \n\t" // then jmp to k_left + "mtctr %%r11 \n\t" // else, do k_iter loop + " \n\t" + "DLOOPKITER: \n\t" // k_iter loop + " \n\t" + A_B_PRODUCT_16 // compute A*B + " \n\t" + "bdnz DLOOPKITER \n\t" + " \n\t" + "DCONSIDERKLEFT: \n\t" + " \n\t" + "cmpwi %%r12, 0 \n\t" // if k_left == 0, + "beq DPOSTACCUM \n\t" // then jmp to post accum + "mtctr %%r12 \n\t" // else, do k_left loop + " \n\t" + "DLOOPKLEFT: \n\t" // k_left loop + " \n\t" + A_B_PRODUCT_1 + " \n\t" + "bdnz DLOOPKLEFT \n\t" + " \n\t" + "DPOSTACCUM: \n\t" + " \n\t" + DSCALE_ALPHA + " \n\t" + "cmpdi %%r26, 0 \n\t" // if beta == 0, + "beq DBETAZERO \n\t" // then jmp to BZ + " \n\t" + DCOL_SCALE_BETA + " \n\t" + "DBETAZERO: \n\t" // BZ case + " \n\t" + DCOL_STORE + " \n\t" + "DDONE: \n\t" + " \n\t" + : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 @@ -174,28 +150,30 @@ void bli_dgemm_power9_asm_12x6 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list - /* unclobberable regs: r2, r3, r4, r5, r6, r13, r14, r15, r30, r31 */ - "r0", "r7", "r8", "r9", - "r10", "r11", "r12", "r16", "r17", "r18", "r19", - "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29" + /* unclobberable regs: r2, r3, r4, r5, r6, r13, r14, r15, r30, r31 */ + "r0", 
"r7", "r8", "r9", + "r10", "r11", "r12", "r16", "r17", "r18", "r19", + "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29" + + #if XLC + ,"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9" + , "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19" + , "f20" ,"f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29" + , "f30" ,"f31" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" + , "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" + , "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29" + , "v30", "v31" + #else + , "vs0", "vs1", "vs2", "vs3", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9" + , "vs10", "vs11", "vs12", "vs13", "vs14", "vs15", "vs16", "vs17", "vs18", "vs19" + , "vs20", "vs21", "vs22", "vs23", "vs24", "vs25", "vs26", "vs27", "vs28", "vs29" + , "vs30", "vs31", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39" + , "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49" + , "vs50", "vs51", "vs52", "vs53" + #endif - #if XLC - ,"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9" - , "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19" - , "f20" ,"f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29" - , "f30" ,"f31" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" - , "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" - , "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29" - , "v30", "v31" - #else - , "vs0", "vs1", "vs2", "vs3", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9" - , "vs10", "vs11", "vs12", "vs13", "vs14", "vs15", "vs16", "vs17", "vs18", "vs19" - , "vs20", "vs21", "vs22", "vs23", "vs24", "vs25", "vs26", "vs27", "vs28", "vs29" - , "vs30", "vs31", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39" - , "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49" - , "vs50", "vs51", "vs52", "vs53" - #endif + ); - ); + 
GEMM_UKR_FLUSH_CT( d ); } diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c index a56ef16e5..7890ad347 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c @@ -42,7 +42,9 @@ void bli_sgemm_sandybridge_asm_8x8 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -57,27 +59,29 @@ void bli_sgemm_sandybridge_asm_8x8 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( s, 8, 8, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(var(b_next), r15) // load address of b_next. - + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b. 
vpermilps(imm(0x4e), ymm2, ymm3) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c; - + lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c @@ -87,7 +91,7 @@ void bli_sgemm_sandybridge_asm_8x8 prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c - + vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) @@ -96,18 +100,18 @@ void bli_sgemm_sandybridge_asm_8x8 vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // iteration 0 prefetch(0, mem(rax, 16*32)) vmulps(ymm0, ymm2, ymm6) @@ -117,14 +121,14 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) vaddps(ymm15, ymm6, ymm15) vaddps(ymm13, ymm7, ymm13) - + vmovaps(mem(rax, 1*32), ymm1) vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vaddps(ymm11, ymm6, ymm11) vaddps(ymm9, ymm7, ymm9) - + vmulps(ymm0, ymm2, ymm6) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 1*32), ymm2) @@ -132,13 +136,13 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) vaddps(ymm14, ymm6, ymm14) vaddps(ymm12, ymm7, ymm12) - + vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vaddps(ymm10, ymm6, ymm10) vaddps(ymm8, ymm7, ymm8) - + // iteration 1 vmulps(ymm1, ymm2, ymm6) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) @@ -147,14 +151,14 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) 
vaddps(ymm15, ymm6, ymm15) vaddps(ymm13, ymm7, ymm13) - + vmovaps(mem(rax, 2*32), ymm0) vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddps(ymm11, ymm6, ymm11) vaddps(ymm9, ymm7, ymm9) - + vmulps(ymm1, ymm2, ymm6) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 2*32), ymm2) @@ -162,14 +166,14 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) vaddps(ymm14, ymm6, ymm14) vaddps(ymm12, ymm7, ymm12) - + vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddps(ymm10, ymm6, ymm10) vaddps(ymm8, ymm7, ymm8) - - + + // iteration 2 prefetch(0, mem(rax, 18*32)) vmulps(ymm0, ymm2, ymm6) @@ -179,7 +183,7 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) vaddps(ymm15, ymm6, ymm15) vaddps(ymm13, ymm7, ymm13) - + vmovaps(mem(rax, 3*32), ymm1) add(imm(4*8*4), rax) // a += 4*8 (unroll x mr) vpermilps(imm(0x4e), ymm2, ymm3) @@ -187,7 +191,7 @@ void bli_sgemm_sandybridge_asm_8x8 vmulps(ymm0, ymm5, ymm7) vaddps(ymm11, ymm6, ymm11) vaddps(ymm9, ymm7, ymm9) - + vmulps(ymm0, ymm2, ymm6) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 3*32), ymm2) @@ -195,14 +199,14 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) vaddps(ymm14, ymm6, ymm14) vaddps(ymm12, ymm7, ymm12) - + vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vaddps(ymm10, ymm6, ymm10) vaddps(ymm8, ymm7, ymm8) - - + + // iteration 3 vmulps(ymm1, ymm2, ymm6) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) @@ -212,14 +216,14 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) vaddps(ymm15, ymm6, ymm15) vaddps(ymm13, ymm7, ymm13) - + vmovaps(mem(rax, 0*32), ymm0) vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddps(ymm11, ymm6, ymm11) vaddps(ymm9, ymm7, ymm9) - + vmulps(ymm1, ymm2, ymm6) vperm2f128(imm(0x03), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 0*32), ymm2) @@ -227,35 
+231,35 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x03), ymm3, ymm3, ymm5) vaddps(ymm14, ymm6, ymm14) vaddps(ymm12, ymm7, ymm12) - + vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddps(ymm10, ymm6, ymm10) vaddps(ymm8, ymm7, ymm8) - - - - + + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP - - + + prefetch(0, mem(rax, 16*32)) vmulps(ymm0, ymm2, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) @@ -264,7 +268,7 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm15, ymm6, ymm15) vaddps(ymm13, ymm7, ymm13) - + vmovaps(mem(rax, 1*32), ymm1) add(imm(8*1*4), rax) // a += 8 (1 x mr) vpermilps(imm(0x4e), ymm2, ymm3) @@ -272,7 +276,7 @@ void bli_sgemm_sandybridge_asm_8x8 vmulps(ymm0, ymm5, ymm7) vaddps(ymm11, ymm6, ymm11) vaddps(ymm9, ymm7, ymm9) - + vmulps(ymm0, ymm2, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmovsldup(mem(rbx, 1*32), ymm2) @@ -281,122 +285,122 @@ void bli_sgemm_sandybridge_asm_8x8 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm14, ymm6, ymm14) vaddps(ymm12, ymm7, ymm12) - + vpermilps(imm(0x4e), ymm2, ymm3) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(ymm1, ymm0) vaddps(ymm10, ymm6, ymm10) vaddps(ymm8, ymm7, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab02 ( ab04 ( ab06 - // ab10 ab12 ab14 ab16 + // ab10 ab12 ab14 ab16 // ab22 ab20 ab26 ab24 // ab32 ab30 ab36 ab34 // ab44 ab46 ab40 ab42 - // ab54 ab56 ab50 ab52 + // ab54 ab56 ab50 ab52 // ab66 ab64 ab62 ab60 // ab76 ) ab74 ) ab72 ) ab70 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab01 ( ab03 ( ab05 ( ab07 - // ab11 ab13 ab15 ab17 + // ab11 ab13 ab15 ab17 // ab23 ab21 ab27 ab25 // ab33 ab31 ab37 ab35 // ab45 ab47 ab41 ab43 - // ab55 ab57 ab51 ab53 + // ab55 ab57 ab51 ab53 // ab67 ab65 ab63 ab61 // ab77 ) ab75 ) ab73 ) ab71 ) - + vmovaps(ymm15, ymm7) vshufps(imm(0xe4), ymm13, ymm15, ymm15) vshufps(imm(0xe4), ymm7, ymm13, ymm13) - + vmovaps(ymm11, ymm7) vshufps(imm(0xe4), ymm9, ymm11, ymm11) vshufps(imm(0xe4), ymm7, ymm9, ymm9) - + vmovaps(ymm14, ymm7) vshufps(imm(0xe4), ymm12, ymm14, ymm14) vshufps(imm(0xe4), ymm7, ymm12, ymm12) - + vmovaps(ymm10, ymm7) vshufps(imm(0xe4), ymm8, ymm10, ymm10) vshufps(imm(0xe4), ymm7, ymm8, ymm8) - + // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab02 ( ab04 ( ab06 - // ab10 ab12 ab14 ab16 + // ab10 ab12 ab14 ab16 // ab20 ab22 ab24 ab26 // ab30 ab32 ab34 ab36 // ab44 ab46 ab40 ab42 - // ab54 ab56 ab50 ab52 + // ab54 ab56 ab50 ab52 // ab64 ab66 ab60 ab62 // ab74 ) ab76 ) ab70 ) ab72 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab01 ( ab03 ( ab05 ( ab07 - // ab11 ab13 ab15 ab17 + // ab11 ab13 ab15 ab17 // ab21 ab23 ab25 ab27 // ab31 ab33 ab35 ab37 // ab45 ab47 ab41 ab43 - // ab55 ab57 ab51 ab53 + // ab55 ab57 ab51 ab53 // ab65 ab67 ab61 ab63 // ab75 ) ab77 ) ab71 ) ab73 ) - + vmovaps(ymm15, ymm7) vperm2f128(imm(0x30), ymm11, ymm15, ymm15) vperm2f128(imm(0x12), ymm11, ymm7, ymm11) - + vmovaps(ymm13, ymm7) vperm2f128(imm(0x30), ymm9, ymm13, ymm13) vperm2f128(imm(0x12), ymm9, ymm7, ymm9) - + vmovaps(ymm14, ymm7) vperm2f128(imm(0x30), ymm10, ymm14, ymm14) vperm2f128(imm(0x12), ymm10, ymm7, ymm10) - + vmovaps(ymm12, ymm7) vperm2f128(imm(0x30), ymm8, ymm12, ymm12) 
vperm2f128(imm(0x12), ymm8, ymm7, ymm8) - + // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab02 ( ab04 ( ab06 - // ab10 ab12 ab14 ab16 + // ab10 ab12 ab14 ab16 // ab20 ab22 ab24 ab26 // ab30 ab32 ab34 ab36 // ab40 ab42 ab44 ab46 - // ab50 ab52 ab54 ab56 + // ab50 ab52 ab54 ab56 // ab60 ab62 ab64 ab66 // ab70 ) ab72 ) ab74 ) ab76 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab01 ( ab03 ( ab05 ( ab07 - // ab11 ab13 ab15 ab17 + // ab11 ab13 ab15 ab17 // ab21 ab23 ab25 ab27 // ab31 ab33 ab35 ab37 // ab41 ab43 ab45 ab47 - // ab51 ab53 ab55 ab57 + // ab51 ab53 ab55 ab57 // ab61 ab63 ab65 ab67 // ab71 ) ab73 ) ab75 ) ab77 ) - - - + + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm4) // load beta and duplicate - + vmulps(ymm0, ymm8, ymm8) // scale by alpha vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) @@ -405,618 +409,118 @@ void bli_sgemm_sandybridge_asm_8x8 vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) - + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; - - + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm4) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. 
- jz(.SCOLSTORED) // jump to column storage case - - - - label(.SGENSTORED) - - // update c00:c70 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm15, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c01:c71 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm14, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - 
vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c02:c72 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm13, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c03:c73 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm12, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, 
mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c04:c74 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm11, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c05:c75 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), 
xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm10, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c06:c76 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm9, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c07:c77 - vmovlps(mem(rcx), xmm0, xmm0) - vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) - vmovlps(mem(rcx, r12, 1), xmm1, xmm1) - vmovhps(mem(rcx, r13, 1), xmm1, xmm1) - 
vshufps(imm(0x88), xmm1, xmm0, xmm0) - vmovlps(mem(rdx), xmm2, xmm2) - vmovhps(mem(rdx, rsi, 1), xmm2, xmm2) - vmovlps(mem(rdx, r12, 1), xmm3, xmm3) - vmovhps(mem(rdx, r13, 1), xmm3, xmm3) - vshufps(imm(0x88), xmm3, xmm2, xmm2) - vperm2f128(imm(0x20), ymm2, ymm0, ymm0) - - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm8, ymm0, ymm0) // add the gemm result, - - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - - - jmp(.SDONE) // jump to end. - - - - label(.SCOLSTORED) - - - vmovups(mem(rcx), ymm0) // load c00:c70, - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm15, ymm0, ymm0) // add the gemm result, - vmovups(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(mem(rcx), ymm1) // load c01:c71, - vmulps(ymm4, ymm1, ymm1) // scale by beta, - vaddps(ymm14, ymm1, ymm1) // add the gemm result, - vmovups(ymm1, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(mem(rcx), ymm0) // load c02:c72, - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm13, ymm0, ymm0) // add the gemm result, - vmovups(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(mem(rcx), ymm1) // load c03:c73, - vmulps(ymm4, ymm1, ymm1) // scale by beta, - vaddps(ymm12, ymm1, ymm1) // add the gemm result, - vmovups(ymm1, mem(rcx)) // and store back to memory. 
- add(rdi, rcx) // c += cs_c; - - vmovups(mem(rcx), ymm0) // load c04:c74, - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm11, ymm0, ymm0) // add the gemm result, - vmovups(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(mem(rcx), ymm1) // load c05:c75, - vmulps(ymm4, ymm1, ymm1) // scale by beta, - vaddps(ymm10, ymm1, ymm1) // add the gemm result, - vmovups(ymm1, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(mem(rcx), ymm0) // load c06:c76, - vmulps(ymm4, ymm0, ymm0) // scale by beta, - vaddps(ymm9, ymm0, ymm0) // add the gemm result, - vmovups(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(mem(rcx), ymm1) // load c07:c77, - vmulps(ymm4, ymm1, ymm1) // scale by beta, - vaddps(ymm8, ymm1, ymm1) // add the gemm result, - vmovups(ymm1, mem(rcx)) // and store back to memory. - - - jmp(.SDONE) // jump to end. - - - - + + vmovups(mem(rcx), ymm0) // load c00:c70, + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm15, ymm0, ymm0) // add the gemm result, + vmovups(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm1) // load c01:c71, + vmulps(ymm4, ymm1, ymm1) // scale by beta, + vaddps(ymm14, ymm1, ymm1) // add the gemm result, + vmovups(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm0) // load c02:c72, + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm13, ymm0, ymm0) // add the gemm result, + vmovups(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm1) // load c03:c73, + vmulps(ymm4, ymm1, ymm1) // scale by beta, + vaddps(ymm12, ymm1, ymm1) // add the gemm result, + vmovups(ymm1, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm0) // load c04:c74, + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm11, ymm0, ymm0) // add the gemm result, + vmovups(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm1) // load c05:c75, + vmulps(ymm4, ymm1, ymm1) // scale by beta, + vaddps(ymm10, ymm1, ymm1) // add the gemm result, + vmovups(ymm1, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm0) // load c06:c76, + vmulps(ymm4, ymm0, ymm0) // scale by beta, + vaddps(ymm9, ymm0, ymm0) // add the gemm result, + vmovups(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(mem(rcx), ymm1) // load c07:c77, + vmulps(ymm4, ymm1, ymm1) // scale by beta, + vaddps(ymm8, ymm1, ymm1) // add the gemm result, + vmovups(ymm1, mem(rcx)) // and store back to memory. + + jmp(.SDONE) // jump to end. + label(.SBETAZERO) - - cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. 
- jz(.SCOLSTORBZ) // jump to column storage case - - - - label(.SGENSTORBZ) - - // update c00:c70 - vmovups(ymm15, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c01:c71 - vmovups(ymm14, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c02:c72 - vmovups(ymm13, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c03:c73 - vmovups(ymm12, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - 
vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c04:c74 - vmovups(ymm11, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c05:c75 - vmovups(ymm10, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c06:c76 - vmovups(ymm9, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, 
r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - add(rdi, rcx) // c += cs_c; - add(rdi, rdx) // c += cs_c; - - - // update c07:c77 - vmovups(ymm8, ymm0) - vextractf128(imm(1), ymm0, xmm2) - vmovss(xmm0, mem(rcx)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, rsi, 1)) - vpermilps(imm(0x39), xmm1, xmm0) - vmovss(xmm0, mem(rcx, r12, 1)) - vpermilps(imm(0x39), xmm0, xmm1) - vmovss(xmm1, mem(rcx, r13, 1)) - vmovss(xmm2, mem(rdx)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, rsi, 1)) - vpermilps(imm(0x39), xmm3, xmm2) - vmovss(xmm2, mem(rdx, r12, 1)) - vpermilps(imm(0x39), xmm2, xmm3) - vmovss(xmm3, mem(rdx, r13, 1)) - - - jmp(.SDONE) // jump to end. - - - - label(.SCOLSTORBZ) - - - vmovups(ymm15, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(ymm14, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(ymm13, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(ymm12, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(ymm11, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(ymm10, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(ymm9, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovups(ymm8, mem(rcx)) // and store back to memory. - - - - - + + vmovups(ymm15, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm14, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm13, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm12, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm11, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) // c += cs_c; + + vmovups(ymm10, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm9, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovups(ymm8, mem(rcx)) // and store back to memory. + label(.SDONE) - + vzeroupper() - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - [b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1024,11 +528,15 @@ void bli_sgemm_sandybridge_asm_8x8 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( s ); } void bli_dgemm_sandybridge_asm_8x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -1043,34 +551,36 @@ void bli_dgemm_sandybridge_asm_8x4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( d, 8, 4, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. 
//mov(var(a_next), r14) // load address of a_next. sub(imm(4*64), r15) - + vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovapd(mem(rbx, 0*32), ymm2) // elements of a and b. vpermilpd(imm(0x5), ymm2, ymm3) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; - + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c - + vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) @@ -1079,19 +589,19 @@ void bli_dgemm_sandybridge_asm_8x4 vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - + add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr) - + // iteration 0 vmovapd(mem(rax, 1*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -1100,7 +610,7 @@ void bli_dgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm15, ymm6, ymm15) vaddpd(ymm13, ymm7, ymm13) - + prefetch(0, mem(rax, 16*32)) vmulpd(ymm1, ymm2, ymm6) vmovapd(mem(rbx, 1*32), ymm2) @@ -1108,20 +618,20 @@ void bli_dgemm_sandybridge_asm_8x4 vpermilpd(imm(0x5), ymm2, ymm3) vaddpd(ymm14, ymm6, ymm14) vaddpd(ymm12, ymm7, ymm12) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 2*32), ymm0) vaddpd(ymm11, ymm6, ymm11) vaddpd(ymm9, ymm7, ymm9) prefetch(0, mem(r15, 0*32)) // prefetch b_next[0*4] - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddpd(ymm10, ymm6, ymm10) vaddpd(ymm8, ymm7, ymm8) - - + + // iteration 1 vmovapd(mem(rax, 3*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -1130,7 +640,7 @@ void bli_dgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm15, ymm6, ymm15) vaddpd(ymm13, ymm7, ymm13) - + prefetch(0, mem(rax, 18*32)) vmulpd(ymm1, ymm2, ymm6) vmovapd(mem(rbx, 2*32), ymm2) @@ -1138,19 +648,19 @@ void bli_dgemm_sandybridge_asm_8x4 vpermilpd(imm(0x5), ymm2, ymm3) vaddpd(ymm14, ymm6, ymm14) vaddpd(ymm12, ymm7, ymm12) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 4*32), ymm0) vaddpd(ymm11, ymm6, ymm11) vaddpd(ymm9, ymm7, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddpd(ymm10, ymm6, ymm10) vaddpd(ymm8, ymm7, ymm8) - - + + // iteration 2 vmovapd(mem(rax, 5*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -1159,7 +669,7 @@ void bli_dgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm15, ymm6, ymm15) vaddpd(ymm13, ymm7, ymm13) - + prefetch(0, mem(rax, 20*32)) vmulpd(ymm1, ymm2, ymm6) vmovapd(mem(rbx, 3*32), ymm2) @@ -1168,20 +678,20 @@ void bli_dgemm_sandybridge_asm_8x4 vpermilpd(imm(0x5), ymm2, ymm3) vaddpd(ymm14, ymm6, ymm14) vaddpd(ymm12, ymm7, 
ymm12) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 6*32), ymm0) vaddpd(ymm11, ymm6, ymm11) vaddpd(ymm9, ymm7, ymm9) prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4] - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddpd(ymm10, ymm6, ymm10) vaddpd(ymm8, ymm7, ymm8) - - + + // iteration 3 vmovapd(mem(rax, 7*32), ymm1) add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) @@ -1191,7 +701,7 @@ void bli_dgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm15, ymm6, ymm15) vaddpd(ymm13, ymm7, ymm13) - + //prefetch(0, mem(rax, 22*32)) prefetch(0, mem(rax, 14*32)) vmulpd(ymm1, ymm2, ymm6) @@ -1200,41 +710,41 @@ void bli_dgemm_sandybridge_asm_8x4 vpermilpd(imm(0x5), ymm2, ymm3) vaddpd(ymm14, ymm6, ymm14) vaddpd(ymm12, ymm7, ymm12) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 0*32), ymm0) vaddpd(ymm11, ymm6, ymm11) vaddpd(ymm9, ymm7, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddpd(ymm10, ymm6, ymm10) vaddpd(ymm8, ymm7, ymm8) - - - + + + //add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) //add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) - + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + vmovapd(mem(rax, 1*32), ymm1) add(imm(8*1*8), rax) // a += 8 (1 x mr) vmulpd(ymm0, ymm2, ymm6) @@ -1243,7 +753,7 @@ void bli_dgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm15, ymm6, ymm15) vaddpd(ymm13, ymm7, ymm13) - + prefetch(0, mem(rax, 14*32)) vmulpd(ymm1, ymm2, ymm6) vmovapd(mem(rbx, 1*32), ymm2) @@ -1252,101 +762,101 @@ void bli_dgemm_sandybridge_asm_8x4 vpermilpd(imm(0x5), ymm2, ymm3) vaddpd(ymm14, ymm6, ymm14) vaddpd(ymm12, ymm7, ymm12) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 0*32), ymm0) vaddpd(ymm11, ymm6, ymm11) vaddpd(ymm9, ymm7, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddpd(ymm10, ymm6, ymm10) vaddpd(ymm8, ymm7, ymm8) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 - // ab11 ab10 ab13 ab12 + // ab11 ab10 ab13 ab12 // ab22 ab23 ab20 ab21 // ab33 ) ab32 ) ab31 ) ab30 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab40 ( ab41 ( ab42 ( ab43 - // ab51 ab50 ab53 ab52 + // ab51 ab50 ab53 ab52 // ab62 ab63 ab60 ab61 // ab73 ) ab72 ) ab71 ) ab70 ) - + vmovapd(ymm15, ymm7) vshufpd(imm(0xa), ymm15, ymm13, ymm15) vshufpd(imm(0xa), ymm13, ymm7, ymm13) - + vmovapd(ymm11, ymm7) vshufpd(imm(0xa), ymm11, ymm9, ymm11) vshufpd(imm(0xa), ymm9, ymm7, ymm9) - + vmovapd(ymm14, ymm7) vshufpd(imm(0xa), ymm14, ymm12, ymm14) vshufpd(imm(0xa), ymm12, ymm7, ymm12) - + vmovapd(ymm10, ymm7) vshufpd(imm(0xa), ymm10, ymm8, ymm10) vshufpd(imm(0xa), ymm8, ymm7, ymm8) - + // ymm15: ymm13: ymm11: ymm9: // ( ab01 ( ab00 ( ab03 ( ab02 - // ab11 ab10 ab13 ab12 + // ab11 ab10 ab13 ab12 // ab23 ab22 ab21 ab20 // ab33 ) ab32 ) ab31 ) ab30 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab41 ( ab40 ( ab43 ( ab42 - // ab51 ab50 ab53 ab52 + // ab51 ab50 ab53 ab52 // ab63 ab62 ab61 ab60 // ab73 ) ab72 ) ab71 ) ab70 ) - + vmovapd(ymm15, ymm7) vperm2f128(imm(0x30), 
ymm15, ymm11, ymm15) vperm2f128(imm(0x12), ymm7, ymm11, ymm11) - + vmovapd(ymm13, ymm7) vperm2f128(imm(0x30), ymm13, ymm9, ymm13) vperm2f128(imm(0x12), ymm7, ymm9, ymm9) - + vmovapd(ymm14, ymm7) vperm2f128(imm(0x30), ymm14, ymm10, ymm14) vperm2f128(imm(0x12), ymm7, ymm10, ymm10) - + vmovapd(ymm12, ymm7) vperm2f128(imm(0x30), ymm12, ymm8, ymm12) vperm2f128(imm(0x12), ymm7, ymm8, ymm8) - + // ymm9: ymm11: ymm13: ymm15: // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 + // ab10 ab11 ab12 ab13 // ab20 ab21 ab22 ab23 // ab30 ) ab31 ) ab32 ) ab33 ) - + // ymm8: ymm10: ymm12: ymm14: // ( ab40 ( ab41 ( ab42 ( ab43 - // ab50 ab51 ab52 ab53 + // ab50 ab51 ab52 ab53 // ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm2) // load beta and duplicate - + vmulpd(ymm0, ymm8, ymm8) // scale by alpha vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) @@ -1355,343 +865,124 @@ void bli_dgemm_sandybridge_asm_8x4 vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) - + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; - - + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm2) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
- jz(.DCOLSTORED) // jump to column storage case - - - - label(.DGENSTORED) - // update c00:c33 - - vextractf128(imm(1), ymm9, xmm1) - vmovlpd(mem(rcx), xmm0, xmm0) // load c00 and c10, - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm9, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx)) // and store back to memory. - vmovhpd(xmm0, mem(rcx, rsi, 1)) - vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c20 and c30, - vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. - vmovhpd(xmm0, mem(rcx, r13, 1)) - add(rdi, rcx) // c += cs_c; - - vextractf128(imm(1), ymm11, xmm1) - vmovlpd(mem(rcx), xmm0, xmm0) // load c01 and c11, - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm11, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx)) // and store back to memory. - vmovhpd(xmm0, mem(rcx, rsi, 1)) - vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c21 and c31, - vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. - vmovhpd(xmm0, mem(rcx, r13, 1)) - add(rdi, rcx) // c += cs_c; - - vextractf128(imm(1), ymm13, xmm1) - vmovlpd(mem(rcx), xmm0, xmm0) // load c02 and c12, - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm13, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx)) // and store back to memory. - vmovhpd(xmm0, mem(rcx, rsi, 1)) - vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c22 and c32, - vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. 
- vmovhpd(xmm0, mem(rcx, r13, 1)) - add(rdi, rcx) // c += cs_c; - - vextractf128(imm(1), ymm15, xmm1) - vmovlpd(mem(rcx), xmm0, xmm0) // load c03 and c13, - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm15, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx)) // and store back to memory. - vmovhpd(xmm0, mem(rcx, rsi, 1)) - vmovlpd(mem(rcx, r12, 1), xmm0, xmm0) // load c23 and c33, - vmovhpd(mem(rcx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rcx, r12, 1)) // and store back to memory. - vmovhpd(xmm0, mem(rcx, r13, 1)) - - // update c40:c73 - - vextractf128(imm(1), ymm8, xmm1) - vmovlpd(mem(rdx), xmm0, xmm0) // load c40 and c50, - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm8, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx)) // and store back to memory. - vmovhpd(xmm0, mem(rdx, rsi, 1)) - vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c60 and c70, - vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. - vmovhpd(xmm0, mem(rdx, r13, 1)) - add(rdi, rdx) // c += cs_c; - - vextractf128(imm(1), ymm10, xmm1) - vmovlpd(mem(rdx), xmm0, xmm0) // load c41 and c51, - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm10, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx)) // and store back to memory. - vmovhpd(xmm0, mem(rdx, rsi, 1)) - vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c61 and c71, - vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. 
- vmovhpd(xmm0, mem(rdx, r13, 1)) - add(rdi, rdx) // c += cs_c; - - vextractf128(imm(1), ymm12, xmm1) - vmovlpd(mem(rdx), xmm0, xmm0) // load c42 and c52, - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm12, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx)) // and store back to memory. - vmovhpd(xmm0, mem(rdx, rsi, 1)) - vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c62 and c72, - vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. - vmovhpd(xmm0, mem(rdx, r13, 1)) - add(rdi, rdx) // c += cs_c; - - vextractf128(imm(1), ymm14, xmm1) - vmovlpd(mem(rdx), xmm0, xmm0) // load c43 and c53, - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm14, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx)) // and store back to memory. - vmovhpd(xmm0, mem(rdx, rsi, 1)) - vmovlpd(mem(rdx, r12, 1), xmm0, xmm0) // load c63 and c73, - vmovhpd(mem(rdx, r13, 1), xmm0, xmm0) - vmulpd(xmm2, xmm0, xmm0) // scale by beta, - vaddpd(xmm1, xmm0, xmm0) // add the gemm result, - vmovlpd(xmm0, mem(rdx, r12, 1)) // and store back to memory. - vmovhpd(xmm0, mem(rdx, r13, 1)) - - - jmp(.DDONE) // jump to end. - - - - label(.DCOLSTORED) - // update c00:c33 - - vmovupd(mem(rcx), ymm0) // load c00:c30, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm9, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovupd(mem(rcx), ymm0) // load c01:c31, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm11, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rcx)) // and store back to memory. 
- add(rdi, rcx) // c += cs_c; - - vmovupd(mem(rcx), ymm0) // load c02:c32, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm13, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rcx)) // and store back to memory. - add(rdi, rcx) // c += cs_c; - - vmovupd(mem(rcx), ymm0) // load c03:c33, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm15, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rcx)) // and store back to memory. - - // update c40:c73 - - vmovupd(mem(rdx), ymm0) // load c40:c70, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm8, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rdx)) // and store back to memory. - add(rdi, rdx) // c += cs_c; - - vmovupd(mem(rdx), ymm0) // load c41:c71, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm10, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rdx)) // and store back to memory. - add(rdi, rdx) // c += cs_c; - - vmovupd(mem(rdx), ymm0) // load c42:c72, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm12, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rdx)) // and store back to memory. - add(rdi, rdx) // c += cs_c; - - vmovupd(mem(rdx), ymm0) // load c43:c73, - vmulpd(ymm2, ymm0, ymm0) // scale by beta, - vaddpd(ymm14, ymm0, ymm0) // add the gemm result, - vmovupd(ymm0, mem(rdx)) // and store back to memory. - - - jmp(.DDONE) // jump to end. - - - - + + // update c00:c33 + + vmovupd(mem(rcx), ymm0) // load c00:c30, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm9, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovupd(mem(rcx), ymm0) // load c01:c31, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm11, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. 
+ add(rdi, rcx) // c += cs_c; + + vmovupd(mem(rcx), ymm0) // load c02:c32, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm13, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. + add(rdi, rcx) // c += cs_c; + + vmovupd(mem(rcx), ymm0) // load c03:c33, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm15, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rcx)) // and store back to memory. + + // update c40:c73 + + vmovupd(mem(rdx), ymm0) // load c40:c70, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm8, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + add(rdi, rdx) // c += cs_c; + + vmovupd(mem(rdx), ymm0) // load c41:c71, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm10, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + add(rdi, rdx) // c += cs_c; + + vmovupd(mem(rdx), ymm0) // load c42:c72, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm12, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + add(rdi, rdx) // c += cs_c; + + vmovupd(mem(rdx), ymm0) // load c43:c73, + vmulpd(ymm2, ymm0, ymm0) // scale by beta, + vaddpd(ymm14, ymm0, ymm0) // add the gemm result, + vmovupd(ymm0, mem(rdx)) // and store back to memory. + + jmp(.DDONE) // jump to end. + label(.DBETAZERO) - - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
- jz(.DCOLSTORBZ) // jump to column storage case - - - - label(.DGENSTORBZ) - // update c00:c33 - - vextractf128(imm(1), ymm9, xmm1) - vmovlpd(xmm9, mem(rcx)) // store to c00:c30 - vmovhpd(xmm9, mem(rcx, rsi, 1)) - vmovlpd(xmm1, mem(rcx, r12, 1)) - vmovhpd(xmm1, mem(rcx, r13, 1)) - add(rdi, rcx) // c += cs_c; - - vextractf128(imm(1), ymm11, xmm1) - vmovlpd(xmm11, mem(rcx)) // store to c01:c31 - vmovhpd(xmm11, mem(rcx, rsi, 1)) - vmovlpd(xmm1, mem(rcx, r12, 1)) - vmovhpd(xmm1, mem(rcx, r13, 1)) - add(rdi, rcx) // c += cs_c; - - vextractf128(imm(1), ymm13, xmm1) - vmovlpd(xmm13, mem(rcx)) // store to c02:c32 - vmovhpd(xmm13, mem(rcx, rsi, 1)) - vmovlpd(xmm1, mem(rcx, r12, 1)) - vmovhpd(xmm1, mem(rcx, r13, 1)) - add(rdi, rcx) // c += cs_c; - - vextractf128(imm(1), ymm15, xmm1) - vmovlpd(xmm15, mem(rcx)) // store to c03:c33 - vmovhpd(xmm15, mem(rcx, rsi, 1)) - vmovlpd(xmm1, mem(rcx, r12, 1)) - vmovhpd(xmm1, mem(rcx, r13, 1)) - - // update c40:c73 - - vextractf128(imm(1), ymm8, xmm1) - vmovlpd(xmm8, mem(rdx)) // store to c40:c70 - vmovhpd(xmm8, mem(rdx, rsi, 1)) - vmovlpd(xmm1, mem(rdx, r12, 1)) - vmovhpd(xmm1, mem(rdx, r13, 1)) - add(rdi, rdx) // c += cs_c; - - vextractf128(imm(1), ymm10, xmm1) - vmovlpd(xmm10, mem(rdx)) // store to c41:c71 - vmovhpd(xmm10, mem(rdx, rsi, 1)) - vmovlpd(xmm1, mem(rdx, r12, 1)) - vmovhpd(xmm1, mem(rdx, r13, 1)) - add(rdi, rdx) // c += cs_c; - - vextractf128(imm(1), ymm12, xmm1) - vmovlpd(xmm12, mem(rdx)) // store to c42:c72 - vmovhpd(xmm12, mem(rdx, rsi, 1)) - vmovlpd(xmm1, mem(rdx, r12, 1)) - vmovhpd(xmm1, mem(rdx, r13, 1)) - add(rdi, rdx) // c += cs_c; - - vextractf128(imm(1), ymm14, xmm1) - vmovlpd(xmm14, mem(rdx)) // store to c43:c73 - vmovhpd(xmm14, mem(rdx, rsi, 1)) - vmovlpd(xmm1, mem(rdx, r12, 1)) - vmovhpd(xmm1, mem(rdx, r13, 1)) - - - jmp(.DDONE) // jump to end. 
- - - - label(.DCOLSTORBZ) - // update c00:c33 - - vmovupd(ymm9, mem(rcx)) // store c00:c30 - add(rdi, rcx) // c += cs_c; - - vmovupd(ymm11, mem(rcx)) // store c01:c31 - add(rdi, rcx) // c += cs_c; - - vmovupd(ymm13, mem(rcx)) // store c02:c32 - add(rdi, rcx) // c += cs_c; - - vmovupd(ymm15, mem(rcx)) // store c03:c33 - - // update c40:c73 - - vmovupd(ymm8, mem(rdx)) // store c40:c70 - add(rdi, rdx) // c += cs_c; - - vmovupd(ymm10, mem(rdx)) // store c41:c71 - add(rdi, rdx) // c += cs_c; - - vmovupd(ymm12, mem(rdx)) // store c42:c72 - add(rdi, rdx) // c += cs_c; - - vmovupd(ymm14, mem(rdx)) // store c43:c73 - - - - - + + // update c00:c33 + + vmovupd(ymm9, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm11, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm13, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm15, mem(rcx)) // store c03:c33 + + // update c40:c73 + + vmovupd(ymm8, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm10, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm12, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm14, mem(rdx)) // store c43:c73 + label(.DDONE) - - vzeroupper() - + vzeroupper() - - end_asm( + + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next)/*, // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", 
"rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -1699,11 +990,15 @@ void bli_dgemm_sandybridge_asm_8x4 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } void bli_cgemm_sandybridge_asm_8x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, @@ -1718,34 +1013,36 @@ void bli_cgemm_sandybridge_asm_8x4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( c, 8, 4, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. //mov(var(a_next), r14) // load address of a_next. sub(imm(4*64), r15) - + vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) vpermilps(imm(0x4e), ymm2, ymm3) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; - + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c - + vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) @@ -1754,19 +1051,19 @@ void bli_cgemm_sandybridge_asm_8x4 vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.CLOOPKITER) // MAIN LOOP - + add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr) - + // iteration 0 prefetch(0, mem(rax, 8*32)) vmovaps(mem(rax, 1*32), ymm1) @@ -1776,20 +1073,20 @@ void bli_cgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm15, ymm15) vaddps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovshdup(mem(rbx, 0*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddps(ymm6, ymm14, ymm14) vaddps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vpermilps(imm(0xb1), ymm0, ymm0) vaddps(ymm6, ymm11, ymm11) vaddps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulps(ymm1, ymm5, ymm7) @@ -1797,32 +1094,32 @@ void bli_cgemm_sandybridge_asm_8x4 vaddps(ymm6, ymm10, ymm10) vaddps(ymm7, ymm8, ymm8) prefetch(0, mem(r15, 0*32)) // prefetch b_next[0*4] - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 1*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 2*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + // iteration 1 prefetch(0, mem(rax, 10*32)) vmovaps(mem(rax, 3*32), ymm1) @@ -1832,52 +1129,52 @@ void bli_cgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm15, ymm15) vaddps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovshdup(mem(rbx, 1*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddps(ymm6, ymm14, ymm14) vaddps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vpermilps(imm(0xb1), ymm0, ymm0) vaddps(ymm6, ymm11, ymm11) 
vaddps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulps(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm10, ymm10) vaddps(ymm7, ymm8, ymm8) - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 2*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 4*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + // iteration 2 prefetch(0, mem(rax, 12*32)) vmovaps(mem(rax, 5*32), ymm1) @@ -1887,20 +1184,20 @@ void bli_cgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm15, ymm15) vaddps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovshdup(mem(rbx, 2*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddps(ymm6, ymm14, ymm14) vaddps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vpermilps(imm(0xb1), ymm0, ymm0) vaddps(ymm6, ymm11, ymm11) vaddps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulps(ymm1, ymm5, ymm7) @@ -1908,32 +1205,32 @@ void bli_cgemm_sandybridge_asm_8x4 vaddps(ymm6, ymm10, ymm10) vaddps(ymm7, ymm8, ymm8) prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4] - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 3*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 6*32), 
ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + // iteration 3 prefetch(0, mem(rax, 14*32)) vmovaps(mem(rax, 7*32), ymm1) @@ -1943,74 +1240,74 @@ void bli_cgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm15, ymm15) vaddps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovshdup(mem(rbx, 3*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddps(ymm6, ymm14, ymm14) vaddps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vpermilps(imm(0xb1), ymm0, ymm0) vaddps(ymm6, ymm11, ymm11) vaddps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulps(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm10, ymm10) vaddps(ymm7, ymm8, ymm8) - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 4*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 8*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + add(imm(8*4*8), rax) // a += 8*4 (unroll x mr) add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) - - + + dec(rsi) // i -= 1; jne(.CLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.CCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.CLOOPKLEFT) // EDGE LOOP - + // iteration 0 prefetch(0, mem(rax, 8*32)) vmovaps(mem(rax, 1*32), ymm1) @@ -2020,228 +1317,228 @@ void bli_cgemm_sandybridge_asm_8x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm15, ymm15) vaddps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovshdup(mem(rbx, 0*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddps(ymm6, ymm14, ymm14) vaddps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vpermilps(imm(0xb1), ymm0, ymm0) vaddps(ymm6, ymm11, ymm11) vaddps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulps(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddps(ymm6, ymm10, ymm10) vaddps(ymm7, ymm8, ymm8) - + vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) - + vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 1*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) - + vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 2*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) - + vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) - - + + add(imm(8*1*8), rax) // a += 8 (1 x mr) add(imm(4*1*8), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.CLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.CPOSTACCUM) - + // ymm15: ymm13: ymm11: ymm9: - // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 - // ab21 ab20 ab23 ab22 - // ab31 ab30 ab33 ab32 - // ab42 ab43 ab40 ab41 - // ab52 ab53 ab50 ab51 - // ab63 ab62 ab61 ab60 + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab21 ab20 ab23 ab22 + // ab31 ab30 ab33 ab32 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab63 ab62 ab61 ab60 // ab73 ) ab72 ) ab71 ) ab70 ) - + // ymm14: ymm12: ymm10: ymm8: - // ( ab80 ( ab81 ( ab82 ( ab83 - // ab90 ab91 ab92 ab93 - // aba1 aba0 aba3 aba2 - // abb1 abb0 abb3 abb2 - // abc2 abc3 abc0 abc1 - // abd2 abd3 abd0 abd1 - // abe3 abe2 abe1 abe0 + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba1 aba0 aba3 aba2 + // abb1 abb0 abb3 abb2 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe3 abe2 abe1 abe0 // abf3 abf2 abf1 abf0 ) - + vmovaps(ymm15, ymm7) vshufps(imm(0xe4), ymm13, ymm15, ymm15) vshufps(imm(0xe4), ymm7, ymm13, ymm13) - + vmovaps(ymm11, ymm7) vshufps(imm(0xe4), ymm9, ymm11, ymm11) vshufps(imm(0xe4), ymm7, ymm9, ymm9) - + vmovaps(ymm14, ymm7) vshufps(imm(0xe4), ymm12, ymm14, ymm14) vshufps(imm(0xe4), ymm7, ymm12, ymm12) - + vmovaps(ymm10, ymm7) vshufps(imm(0xe4), ymm8, ymm10, ymm10) vshufps(imm(0xe4), ymm7, ymm8, ymm8) - + // ymm15: ymm13: ymm11: ymm9: - // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 - // ab20 ab21 ab22 ab23 - // ab30 ab31 ab32 ab33 - // ab42 ab43 ab40 ab41 - // ab52 ab53 ab50 ab51 - // ab62 ab63 ab60 ab61 + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab42 ab43 ab40 ab41 + // ab52 ab53 ab50 ab51 + // ab62 ab63 ab60 ab61 // ab72 ) ab73 ) ab70 ) ab71 ) - + // ymm14: ymm12: ymm10: ymm8: - // ( ab80 ( ab81 ( ab82 ( ab83 - // ab90 ab91 ab92 ab93 - // aba0 aba1 aba2 aba3 - // abb0 abb1 abb2 abb3 - // abc2 abc3 abc0 abc1 - // abd2 abd3 abd0 abd1 - // abe2 abe3 abe0 abe1 + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 
ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc2 abc3 abc0 abc1 + // abd2 abd3 abd0 abd1 + // abe2 abe3 abe0 abe1 // abf2 ) abf3 ) abf0 ) abf1 ) - + vmovaps(ymm15, ymm7) vperm2f128(imm(0x12), ymm15, ymm11, ymm15) vperm2f128(imm(0x30), ymm7, ymm11, ymm11) - + vmovaps(ymm13, ymm7) vperm2f128(imm(0x12), ymm13, ymm9, ymm13) vperm2f128(imm(0x30), ymm7, ymm9, ymm9) - + vmovaps(ymm14, ymm7) vperm2f128(imm(0x12), ymm14, ymm10, ymm14) vperm2f128(imm(0x30), ymm7, ymm10, ymm10) - + vmovaps(ymm12, ymm7) vperm2f128(imm(0x12), ymm12, ymm8, ymm12) vperm2f128(imm(0x30), ymm7, ymm8, ymm8) - + // ymm15: ymm13: ymm11: ymm9: - // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 - // ab20 ab21 ab22 ab23 - // ab30 ab31 ab32 ab33 - // ab40 ab41 ab42 ab43 - // ab50 ab51 ab52 ab53 - // ab60 ab61 ab62 ab63 + // ( ab00 ( ab01 ( ab02 ( ab03 + // ab10 ab11 ab12 ab13 + // ab20 ab21 ab22 ab23 + // ab30 ab31 ab32 ab33 + // ab40 ab41 ab42 ab43 + // ab50 ab51 ab52 ab53 + // ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) - + // ymm14: ymm12: ymm10: ymm8: - // ( ab80 ( ab81 ( ab82 ( ab83 - // ab90 ab91 ab92 ab93 - // aba0 aba1 aba2 aba3 - // abb0 abb1 abb2 abb3 - // abc0 abc1 abc2 abc3 - // abd0 abd1 abd2 abd3 - // abe0 abe1 abe2 abe3 + // ( ab80 ( ab81 ( ab82 ( ab83 + // ab90 ab91 ab92 ab93 + // aba0 aba1 aba2 aba3 + // abb0 abb1 abb2 abb3 + // abc0 abc1 abc2 abc3 + // abd0 abd1 abd2 abd3 + // abe0 abe1 abe2 abe3 // abf0 ) abf1 ) abf2 ) abf3 ) - - - - + + + + // scale by alpha - + mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate - + vpermilps(imm(0xb1), ymm15, ymm3) vmulps(ymm7, ymm15, ymm15) vmulps(ymm6, ymm3, ymm3) vaddsubps(ymm3, ymm15, ymm15) - + vpermilps(imm(0xb1), ymm14, ymm2) vmulps(ymm7, ymm14, ymm14) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm14, ymm14) - + vpermilps(imm(0xb1), ymm13, ymm1) vmulps(ymm7, ymm13, ymm13) vmulps(ymm6, ymm1, ymm1) 
vaddsubps(ymm1, ymm13, ymm13) - + vpermilps(imm(0xb1), ymm12, ymm0) vmulps(ymm7, ymm12, ymm12) vmulps(ymm6, ymm0, ymm0) vaddsubps(ymm0, ymm12, ymm12) - + vpermilps(imm(0xb1), ymm11, ymm3) vmulps(ymm7, ymm11, ymm11) vmulps(ymm6, ymm3, ymm3) vaddsubps(ymm3, ymm11, ymm11) - + vpermilps(imm(0xb1), ymm10, ymm2) vmulps(ymm7, ymm10, ymm10) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm10, ymm10) - + vpermilps(imm(0xb1), ymm9, ymm1) vmulps(ymm7, ymm9, ymm9) vmulps(ymm6, ymm1, ymm1) vaddsubps(ymm1, ymm9, ymm9) - + vpermilps(imm(0xb1), ymm8, ymm0) vmulps(ymm7, ymm8, ymm8) vmulps(ymm6, ymm0, ymm0) vaddsubps(ymm0, ymm8, ymm8) - - - - + + + + mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate - - - - - - - + + + + + + + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex) - + lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; - + lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c; - - + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm7) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -2249,410 +1546,144 @@ void bli_cgemm_sandybridge_asm_8x4 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case - - - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
- jz(.CCOLSTORED) // jump to column storage case - - - - label(.CGENSTORED) - - // update c00:c70 - - vmovlpd(mem(rcx), xmm0, xmm0) // load (c00,10) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c20,30) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c40,50) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c60,70) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c00,c10) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c20,c30) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70) - add(rdi, rcx) // c += cs_c; - - // update c80:cf0 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c80,90) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca0,b0) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc0,d0) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce0,f0) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c80,c90) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca0,cb0) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0) - add(rdi, rdx) // c += cs_c; - - // update c01:c71 - - vmovlpd(mem(rcx), xmm0, xmm0) // load (c01,11) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c21,31) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) 
// load (c41,51) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c61,71) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c01,c11) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c21,c31) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71) - add(rdi, rcx) // c += cs_c; - - // update c81:cf1 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c81,91) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca1,b1) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc1,d1) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce1,f1) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c81,c91) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca1,cb1) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1) - add(rdi, rdx) // c += cs_c; - - // update c02:c72 - - vmovlpd(mem(rcx), xmm0, xmm0) // load (c02,12) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c22,32) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c42,52) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c62,72) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, 
ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c02,c12) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c22,c32) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72) - add(rdi, rcx) // c += cs_c; - - // update c82:cf2 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c82,92) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca2,b2) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc2,d2) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce2,f2) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c82,c92) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca2,cb2) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2) - add(rdi, rdx) // c += cs_c; - - // update c03:c73 - - vmovlpd(mem(rcx), xmm0, xmm0) // load (c03,13) into xmm0[0:1] - vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c23,33) into xmm0[2:3] - vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c43,53) into xmm2[0:1] - vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c63,73) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rcx)) // store (c03,c13) - vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c23,c33) - vmovlpd(xmm2, 
mem(rcx, r12, 1)) // store (c43,c53) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73) - add(rdi, rcx) // c += cs_c; - - // update c83:cf3 - - vmovlpd(mem(rdx), xmm0, xmm0) // load (c83,93) into xmm0[0:1] - vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca3,b3) into xmm0[2:3] - vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc3,d3) into xmm2[0:1] - vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce3,f3) into xmm2[2:3] - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2) - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm0, mem(rdx)) // store (c83,c93) - vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca3,cb3) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3) - add(rdi, rdx) // c += cs_c; - - - - jmp(.CDONE) // jump to end. 
- - - - label(.CCOLSTORED) - - // update c00:c70 - - vmovups(mem(rcx), ymm0) // load c00:c70 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rcx)) // store c00:c70 - add(rdi, rcx) // c += cs_c; - - // update c80:cf0 - - vmovups(mem(rdx), ymm0) // load c80:f0 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rdx)) // store c80:cf0 - add(rdi, rdx) // c += cs_c; - - // update c00:c70 - - vmovups(mem(rcx), ymm0) // load c01:c71 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rcx)) // store c01:c71 - add(rdi, rcx) // c += cs_c; - - // update c81:cf1 - - vmovups(mem(rdx), ymm0) // load c81:f1 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rdx)) // store c81:cf1 - add(rdi, rdx) // c += cs_c; - - // update c02:c72 - - vmovups(mem(rcx), ymm0) // load c02:c72 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rcx)) // store c02:c72 - add(rdi, rcx) // c += cs_c; - - // update c82:cf2 - - vmovups(mem(rdx), ymm0) // load c82:f2 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - 
vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rdx)) // store c82:cf2 - add(rdi, rdx) // c += cs_c; - - // update c03:c73 - - vmovups(mem(rcx), ymm0) // load c03:c73 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rcx)) // store c03:c73 - add(rdi, rcx) // c += cs_c; - - // update c83:cf3 - - vmovups(mem(rdx), ymm0) // load c83:f3 into ymm0 - vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta - vmulps(ymm7, ymm0, ymm0) - vmulps(ymm6, ymm2, ymm2) - vaddsubps(ymm2, ymm0, ymm0) - vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vmovups(ymm0, mem(rdx)) // store c83:cf3 - add(rdi, rdx) // c += cs_c; - - - - jmp(.CDONE) // jump to end. - - - + + // update c00:c70 + + vmovups(mem(rcx), ymm0) // load c00:c70 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rcx)) // store c00:c70 + add(rdi, rcx) // c += cs_c; + + // update c80:cf0 + + vmovups(mem(rdx), ymm0) // load c80:f0 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rdx)) // store c80:cf0 + add(rdi, rdx) // c += cs_c; + + // update c00:c70 + + vmovups(mem(rcx), ymm0) // load c01:c71 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rcx)) // store c01:c71 + add(rdi, rcx) // c += cs_c; + + // update c81:cf1 + + vmovups(mem(rdx), ymm0) // load c81:f1 into ymm0 + 
vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rdx)) // store c81:cf1 + add(rdi, rdx) // c += cs_c; + + // update c02:c72 + + vmovups(mem(rcx), ymm0) // load c02:c72 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rcx)) // store c02:c72 + add(rdi, rcx) // c += cs_c; + + // update c82:cf2 + + vmovups(mem(rdx), ymm0) // load c82:f2 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rdx)) // store c82:cf2 + add(rdi, rdx) // c += cs_c; + + // update c03:c73 + + vmovups(mem(rcx), ymm0) // load c03:c73 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rcx)) // store c03:c73 + add(rdi, rcx) // c += cs_c; + + // update c83:cf3 + + vmovups(mem(rdx), ymm0) // load c83:f3 into ymm0 + vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta + vmulps(ymm7, ymm0, ymm0) + vmulps(ymm6, ymm2, ymm2) + vaddsubps(ymm2, ymm0, ymm0) + vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vmovups(ymm0, mem(rdx)) // store c83:cf3 + add(rdi, rdx) // c += cs_c; + + jmp(.CDONE) // jump to end. + label(.CBETAZERO) - - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. 
- jz(.CCOLSTORBZ) // jump to column storage case - - - - label(.CGENSTORBZ) - - // update c00:c70 - - vextractf128(imm(1), ymm15, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm15, mem(rcx)) // store (c00,c10) - vmovhpd(xmm15, mem(rcx, rsi, 1)) // store (c20,c30) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70) - add(rdi, rcx) // c += cs_c; - - // update c80:cf0 - - vextractf128(imm(1), ymm14, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm14, mem(rdx)) // store (c80,c90) - vmovhpd(xmm14, mem(rdx, rsi, 1)) // store (ca0,cb0) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0) - add(rdi, rdx) // c += cs_c; - - // update c01:c71 - - vextractf128(imm(1), ymm13, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm13, mem(rcx)) // store (c01,c11) - vmovhpd(xmm13, mem(rcx, rsi, 1)) // store (c21,c31) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71) - add(rdi, rcx) // c += cs_c; - - // update c81:cf1 - - vextractf128(imm(1), ymm12, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm12, mem(rdx)) // store (c81,c91) - vmovhpd(xmm12, mem(rdx, rsi, 1)) // store (ca1,cb1) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1) - add(rdi, rdx) // c += cs_c; - - // update c02:c72 - - vextractf128(imm(1), ymm11, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm11, mem(rcx)) // store (c02,c12) - vmovhpd(xmm11, mem(rcx, rsi, 1)) // store (c22,c32) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72) - add(rdi, rcx) // c += cs_c; - - // update c82:cf2 - - vextractf128(imm(1), ymm10, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm10, mem(rdx)) // store (c82,c92) - vmovhpd(xmm10, mem(rdx, rsi, 1)) // store (ca2,cb2) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2) - add(rdi, rdx) // c += cs_c; - - // update c03:c73 
- - vextractf128(imm(1), ymm9, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm9, mem(rcx)) // store (c03,c13) - vmovhpd(xmm9, mem(rcx, rsi, 1)) // store (c23,c33) - vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53) - vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73) - add(rdi, rcx) // c += cs_c; - - // update c83:cf3 - - vextractf128(imm(1), ymm8, xmm2) // xmm2 := ymm0[4:7] - vmovlpd(xmm8, mem(rdx)) // store (c83,c93) - vmovhpd(xmm8, mem(rdx, rsi, 1)) // store (ca3,cb3) - vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3) - vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3) - add(rdi, rdx) // c += cs_c; - - - - jmp(.CDONE) // jump to end. - - - - label(.CCOLSTORBZ) - - - vmovups(ymm15, mem(rcx)) // store c00:c70 - add(rdi, rcx) // c += cs_c; - - vmovups(ymm14, mem(rdx)) // store c80:cf0 - add(rdi, rdx) // c += cs_c; - - vmovups(ymm13, mem(rcx)) // store c01:c71 - add(rdi, rcx) // c += cs_c; - - vmovups(ymm12, mem(rdx)) // store c81:cf1 - add(rdi, rdx) // c += cs_c; - - vmovups(ymm11, mem(rcx)) // store c02:c72 - add(rdi, rcx) // c += cs_c; - - vmovups(ymm10, mem(rdx)) // store c82:cf2 - add(rdi, rdx) // c += cs_c; - - vmovups(ymm9, mem(rcx)) // store c03:c73 - add(rdi, rcx) // c += cs_c; - - vmovups(ymm8, mem(rdx)) // store c83:cf3 - add(rdi, rdx) // c += cs_c; - - - - - + + vmovups(ymm15, mem(rcx)) // store c00:c70 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm14, mem(rdx)) // store c80:cf0 + add(rdi, rdx) // c += cs_c; + + vmovups(ymm13, mem(rcx)) // store c01:c71 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm12, mem(rdx)) // store c81:cf1 + add(rdi, rdx) // c += cs_c; + + vmovups(ymm11, mem(rcx)) // store c02:c72 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm10, mem(rdx)) // store c82:cf2 + add(rdi, rdx) // c += cs_c; + + vmovups(ymm9, mem(rcx)) // store c03:c73 + add(rdi, rcx) // c += cs_c; + + vmovups(ymm8, mem(rdx)) // store c83:cf3 + add(rdi, rdx) // c += cs_c; + label(.CDONE) - - vzeroupper() - + vzeroupper() - - end_asm( + + end_asm( : // output operands (none) : 
// input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c), // 8 - [b_next] "m" (b_next)/*, // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [b_next] "m" (b_next)/*, // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -2660,13 +1691,17 @@ void bli_cgemm_sandybridge_asm_8x4 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( c ); } void bli_zgemm_sandybridge_asm_4x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, @@ -2681,34 +1716,36 @@ void bli_zgemm_sandybridge_asm_4x4 // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - uint64_t k_iter = k0 / 4; - uint64_t k_left = k0 % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMM_UKR_SETUP_CT( z, 4, 4, false ); + begin_asm() - - + + mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(var(b_next), r15) // load address of b_next. //mov(var(a_next), r14) // load address of a_next. 
- + vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovddup(mem(rbx, 0+0*32), ymm2) vmovddup(mem(rbx, 0+1*32), ymm3) - + mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; - + prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c - + vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) @@ -2717,18 +1754,18 @@ void bli_zgemm_sandybridge_asm_4x4 vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.ZLOOPKITER) // MAIN LOOP - - + + // iteration 0 vmovapd(mem(rax, 1*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -2737,7 +1774,7 @@ void bli_zgemm_sandybridge_asm_4x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm15, ymm15) vaddpd(ymm7, ymm11, ymm11) - + prefetch(0, mem(rax, 16*32)) vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 8+0*32), ymm2) @@ -2745,45 +1782,45 @@ void bli_zgemm_sandybridge_asm_4x4 vmovddup(mem(rbx, 8+1*32), ymm3) vaddpd(ymm6, ymm14, ymm14) vaddpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vpermilpd(imm(0x5), ymm0, ymm0) vaddpd(ymm6, ymm13, ymm13) vaddpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulpd(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm12, ymm12) vaddpd(ymm7, ymm8, ymm8) - + vpermilpd(imm(0x5), ymm1, ymm1) vmulpd(ymm0, ymm2, ymm6) vmulpd(ymm0, ymm3, ymm7) vaddsubpd(ymm6, ymm15, ymm15) vaddsubpd(ymm7, ymm11, ymm11) - + vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+2*32), ymm2) 
vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+3*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 2*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - - + + // iteration 1 vmovapd(mem(rax, 3*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -2792,7 +1829,7 @@ void bli_zgemm_sandybridge_asm_4x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm15, ymm15) vaddpd(ymm7, ymm11, ymm11) - + prefetch(0, mem(rax, 18*32)) vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 8+2*32), ymm2) @@ -2800,45 +1837,45 @@ void bli_zgemm_sandybridge_asm_4x4 vmovddup(mem(rbx, 8+3*32), ymm3) vaddpd(ymm6, ymm14, ymm14) vaddpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vpermilpd(imm(0x5), ymm0, ymm0) vaddpd(ymm6, ymm13, ymm13) vaddpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulpd(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm12, ymm12) vaddpd(ymm7, ymm8, ymm8) - + vpermilpd(imm(0x5), ymm1, ymm1) vmulpd(ymm0, ymm2, ymm6) vmulpd(ymm0, ymm3, ymm7) vaddsubpd(ymm6, ymm15, ymm15) vaddsubpd(ymm7, ymm11, ymm11) - + vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+4*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+5*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 4*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - - + + // iteration 2 vmovapd(mem(rax, 5*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -2847,7 +1884,7 @@ void bli_zgemm_sandybridge_asm_4x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm15, ymm15) vaddpd(ymm7, ymm11, ymm11) - + prefetch(0, mem(rax, 20*32)) 
vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 8+4*32), ymm2) @@ -2855,45 +1892,45 @@ void bli_zgemm_sandybridge_asm_4x4 vmovddup(mem(rbx, 8+5*32), ymm3) vaddpd(ymm6, ymm14, ymm14) vaddpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vpermilpd(imm(0x5), ymm0, ymm0) vaddpd(ymm6, ymm13, ymm13) vaddpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulpd(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm12, ymm12) vaddpd(ymm7, ymm8, ymm8) - + vpermilpd(imm(0x5), ymm1, ymm1) vmulpd(ymm0, ymm2, ymm6) vmulpd(ymm0, ymm3, ymm7) vaddsubpd(ymm6, ymm15, ymm15) vaddsubpd(ymm7, ymm11, ymm11) - + vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+6*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+7*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 6*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - - + + // iteration 3 vmovapd(mem(rax, 7*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -2902,7 +1939,7 @@ void bli_zgemm_sandybridge_asm_4x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm15, ymm15) vaddpd(ymm7, ymm11, ymm11) - + prefetch(0, mem(rax, 22*32)) vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 8+6*32), ymm2) @@ -2910,67 +1947,67 @@ void bli_zgemm_sandybridge_asm_4x4 vmovddup(mem(rbx, 8+7*32), ymm3) vaddpd(ymm6, ymm14, ymm14) vaddpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vpermilpd(imm(0x5), ymm0, ymm0) vaddpd(ymm6, ymm13, ymm13) vaddpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulpd(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm12, ymm12) vaddpd(ymm7, ymm8, ymm8) - + vpermilpd(imm(0x5), ymm1, ymm1) vmulpd(ymm0, ymm2, ymm6) vmulpd(ymm0, ymm3, ymm7) vaddsubpd(ymm6, 
ymm15, ymm15) vaddsubpd(ymm7, ymm11, ymm11) - + vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+8*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+9*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 8*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - - + + add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) add(imm(4*4*16), rax) // a += 4*4 (unroll x mr) - - + + dec(rsi) // i -= 1; jne(.ZLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.ZCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.ZLOOPKLEFT) // EDGE LOOP - + // iteration 0 vmovapd(mem(rax, 1*32), ymm1) vmulpd(ymm0, ymm2, ymm6) @@ -2979,7 +2016,7 @@ void bli_zgemm_sandybridge_asm_4x4 vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm15, ymm15) vaddpd(ymm7, ymm11, ymm11) - + prefetch(0, mem(rax, 16*32)) vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 8+0*32), ymm2) @@ -2987,166 +2024,166 @@ void bli_zgemm_sandybridge_asm_4x4 vmovddup(mem(rbx, 8+1*32), ymm3) vaddpd(ymm6, ymm14, ymm14) vaddpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vpermilpd(imm(0x5), ymm0, ymm0) vaddpd(ymm6, ymm13, ymm13) vaddpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vmulpd(ymm1, ymm5, ymm7) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) vaddpd(ymm6, ymm12, ymm12) vaddpd(ymm7, ymm8, ymm8) - + vpermilpd(imm(0x5), ymm1, ymm1) vmulpd(ymm0, ymm2, ymm6) vmulpd(ymm0, ymm3, ymm7) vaddsubpd(ymm6, ymm15, ymm15) vaddsubpd(ymm7, ymm11, ymm11) - + vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+2*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+3*32), ymm3) vaddsubpd(ymm6, ymm14, 
ymm14) vaddsubpd(ymm7, ymm10, ymm10) - + vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 2*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) - + vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) - - + + add(imm(4*1*16), rax) // a += 4 (1 x mr) add(imm(4*1*16), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.ZLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.ZPOSTACCUM) - + // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 + // ab10 ab11 ab12 ab13 // ab21 ab20 ab23 ab22 // ab31 ) ab30 ) ab33 ) ab32 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab40 ( ab41 ( ab42 ( ab43 - // ab50 ab51 ab52 ab53 + // ab50 ab51 ab52 ab53 // ab61 ab60 ab63 ab62 // ab71 ) ab70 ) ab73 ) ab72 ) - - + + vmovapd(ymm15, ymm7) vperm2f128(imm(0x12), ymm15, ymm13, ymm15) vperm2f128(imm(0x30), ymm7, ymm13, ymm13) - + vmovapd(ymm11, ymm7) vperm2f128(imm(0x12), ymm11, ymm9, ymm11) vperm2f128(imm(0x30), ymm7, ymm9, ymm9) - + vmovapd(ymm14, ymm7) vperm2f128(imm(0x12), ymm14, ymm12, ymm14) vperm2f128(imm(0x30), ymm7, ymm12, ymm12) - + vmovapd(ymm10, ymm7) vperm2f128(imm(0x12), ymm10, ymm8, ymm10) vperm2f128(imm(0x30), ymm7, ymm8, ymm8) - - + + // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 - // ab10 ab11 ab12 ab13 + // ab10 ab11 ab12 ab13 // ab20 ab21 ab22 ab23 // ab30 ) ab31 ) ab32 ) ab33 ) - + // ymm14: ymm12: ymm10: ymm8: // ( ab40 ( ab41 ( ab42 ( ab43 - // ab50 ab51 ab52 ab53 + // ab50 ab51 ab52 ab53 // ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) - - + + // scale by alpha - + mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate - + vpermilpd(imm(0x5), ymm15, ymm3) vmulpd(ymm7, ymm15, ymm15) vmulpd(ymm6, ymm3, ymm3) vaddsubpd(ymm3, ymm15, ymm15) - + vpermilpd(imm(0x5), ymm14, ymm2) vmulpd(ymm7, ymm14, ymm14) vmulpd(ymm6, 
ymm2, ymm2) vaddsubpd(ymm2, ymm14, ymm14) - + vpermilpd(imm(0x5), ymm13, ymm1) vmulpd(ymm7, ymm13, ymm13) vmulpd(ymm6, ymm1, ymm1) vaddsubpd(ymm1, ymm13, ymm13) - + vpermilpd(imm(0x5), ymm12, ymm0) vmulpd(ymm7, ymm12, ymm12) vmulpd(ymm6, ymm0, ymm0) vaddsubpd(ymm0, ymm12, ymm12) - + vpermilpd(imm(0x5), ymm11, ymm3) vmulpd(ymm7, ymm11, ymm11) vmulpd(ymm6, ymm3, ymm3) vaddsubpd(ymm3, ymm11, ymm11) - + vpermilpd(imm(0x5), ymm10, ymm2) vmulpd(ymm7, ymm10, ymm10) vmulpd(ymm6, ymm2, ymm2) vaddsubpd(ymm2, ymm10, ymm10) - + vpermilpd(imm(0x5), ymm9, ymm1) vmulpd(ymm7, ymm9, ymm9) vmulpd(ymm6, ymm1, ymm1) vaddsubpd(ymm1, ymm9, ymm9) - + vpermilpd(imm(0x5), ymm8, ymm0) vmulpd(ymm7, ymm8, ymm8) vmulpd(ymm6, ymm0, ymm0) vaddsubpd(ymm0, ymm8, ymm8) - - - - + + + + mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate - - - - - - - + + + + + + + mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(dcomplex) lea(mem(, rsi, 2), rsi) lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; - - + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm7) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); @@ -3154,355 +2191,142 @@ void bli_zgemm_sandybridge_asm_4x4 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case - - - cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. 
- jz(.ZCOLSTORED) // jump to column storage case - - - - label(.ZGENSTORED) - // update c00:c30 - - vmovupd(mem(rcx), xmm0) // load (c00,c10) into xmm0 - vmovupd(mem(rcx, rsi, 1), xmm2) // load (c20,c30) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c00,c10) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) - add(rdi, rcx) // c += cs_c; - - // update c40:c70 - - vmovupd(mem(rdx), xmm0) // load (c40,c50) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c60,c70) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c40,c50) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) - add(rdi, rdx) // c += cs_c; - - // update c01:c31 - - vmovupd(mem(rcx), xmm0) // load (c01,c11) into xmm0 - vmovupd(mem(rcx, rsi, 1), xmm2) // load (c21,c31) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c01,c11) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) - add(rdi, rcx) // c += cs_c; - - // update c41:c71 - - vmovupd(mem(rdx), xmm0) // load (c41,c51) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c61,c71) into xmm2 - 
vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c41,c51) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) - add(rdi, rdx) // c += cs_c; - - // update c02:c32 - - vmovupd(mem(rcx), xmm0) // load (c02,c12) into xmm0 - vmovupd(mem(rcx, rsi, 1), xmm2) // load (c22,c32) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c02,c12) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) - add(rdi, rcx) // c += cs_c; - - // update c42:c72 - - vmovupd(mem(rdx), xmm0) // load (c42,c52) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c62,c72) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c42,c52) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) - add(rdi, rdx) // c += cs_c; - - // update c03:c33 - - vmovupd(mem(rcx), xmm0) // load (c03,c13) into xmm0 - vmovupd(mem(rcx, rsi, 1), xmm2) // load (c23,c33) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm9, ymm0, 
ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rcx)) // store (c03,c13) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) - add(rdi, rcx) // c += cs_c; - - // update c43:c73 - - vmovupd(mem(rdx), xmm0) // load (c43,c53) into xmm0 - vmovupd(mem(rdx, rsi, 1), xmm2) // load (c63,c73) into xmm2 - vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2) - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3] - vmovupd(xmm0, mem(rdx)) // store (c43,c53) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) - - - - jmp(.ZDONE) // jump to end. - - - - label(.ZCOLSTORED) - // update c00:c30 - - vmovupd(mem(rcx), ymm0) // load c00:c30 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rcx)) // store c00:c30 - add(rdi, rcx) // c += cs_c; - - // update c40:c70 - - vmovupd(mem(rdx), ymm0) // load c40:c70 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rdx)) // store c40:c70 - add(rdi, rdx) // c += cs_c; - - // update c01:c31 - - vmovupd(mem(rcx), ymm0) // load c01:c31 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rcx)) // store c01:c31 - add(rdi, rcx) // c += cs_c; - - // update c41:c71 - - vmovupd(mem(rdx), ymm0) // load c41:c71 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // 
scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rdx)) // store c41:c71 - add(rdi, rdx) // c += cs_c; - - // update c02:c32 - - vmovupd(mem(rcx), ymm0) // load c02:c32 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rcx)) // store c02:c32 - add(rdi, rcx) // c += cs_c; - - // update c42:c72 - - vmovupd(mem(rdx), ymm0) // load c42:c72 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rdx)) // store c42:c72 - add(rdi, rdx) // c += cs_c; - - // update c03:c33 - - vmovupd(mem(rcx), ymm0) // load c03:c33 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rcx)) // store c03:c33 - add(rdi, rcx) // c += cs_c; - - // update c43:c73 - - vmovupd(mem(rdx), ymm0) // load c43:c73 into ymm0 - vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta - vmulpd(ymm7, ymm0, ymm0) - vmulpd(ymm6, ymm2, ymm2) - vaddsubpd(ymm2, ymm0, ymm0) - vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 - vmovupd(ymm0, mem(rdx)) // store c43:c73 - - - - jmp(.ZDONE) // jump to end. 
- - - + + // update c00:c30 + + vmovupd(mem(rcx), ymm0) // load c00:c30 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + // update c40:c70 + + vmovupd(mem(rdx), ymm0) // load c40:c70 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + // update c01:c31 + + vmovupd(mem(rcx), ymm0) // load c01:c31 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + // update c41:c71 + + vmovupd(mem(rdx), ymm0) // load c41:c71 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + // update c02:c32 + + vmovupd(mem(rcx), ymm0) // load c02:c32 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + // update c42:c72 + + vmovupd(mem(rdx), ymm0) // load c42:c72 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm10, ymm0, ymm0) // add 
the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + // update c03:c33 + + vmovupd(mem(rcx), ymm0) // load c03:c33 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rcx)) // store c03:c33 + add(rdi, rcx) // c += cs_c; + + // update c43:c73 + + vmovupd(mem(rdx), ymm0) // load c43:c73 into ymm0 + vpermilpd(imm(0x5), ymm0, ymm2) // scale ymm0 by beta + vmulpd(ymm7, ymm0, ymm0) + vmulpd(ymm6, ymm2, ymm2) + vaddsubpd(ymm2, ymm0, ymm0) + vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 + vmovupd(ymm0, mem(rdx)) // store c43:c73 + + jmp(.ZDONE) // jump to end. + label(.ZBETAZERO) - - cmp(imm(16), rsi) // set ZF if (16*cs_c) == 16. - jz(.ZCOLSTORBZ) // jump to column storage case - - - - label(.ZGENSTORBZ) - // update c00:c30 - - vextractf128(imm(1), ymm15, xmm2) - vmovupd(xmm15, mem(rcx)) // store (c00,c10) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30) - add(rdi, rcx) // c += cs_c; - - // update c40:c70 - - vextractf128(imm(1), ymm14, xmm2) - vmovupd(xmm14, mem(rdx)) // store (c40,c50) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70) - add(rdi, rdx) // c += cs_c; - - // update c01:c31 - - vextractf128(imm(1), ymm13, xmm2) - vmovupd(xmm13, mem(rcx)) // store (c01,c11) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31) - add(rdi, rcx) // c += cs_c; - - // update c41:c71 - - vextractf128(imm(1), ymm12, xmm2) - vmovupd(xmm12, mem(rdx)) // store (c41,c51) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71) - add(rdi, rdx) // c += cs_c; - - // update c02:c32 - - vextractf128(imm(1), ymm11, xmm2) - vmovupd(xmm11, mem(rcx)) // store (c02,c12) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32) - add(rdi, rcx) // c += cs_c; - - // update c42:c72 - - vextractf128(imm(1), ymm10, xmm2) - vmovupd(xmm10, mem(rdx)) // store (c42,c52) - 
vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72) - add(rdi, rdx) // c += cs_c; - - // update c03:c33 - - vextractf128(imm(1), ymm9, xmm2) - vmovupd(xmm9, mem(rcx)) // store (c03,c13) - vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33) - add(rdi, rcx) // c += cs_c; - - // update c43:c73 - - vextractf128(imm(1), ymm8, xmm2) - vmovupd(xmm8, mem(rdx)) // store (c43,c53) - vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73) - - - - jmp(.ZDONE) // jump to end. - - - - label(.ZCOLSTORBZ) - - - vmovupd(ymm15, mem(rcx)) // store c00:c30 - add(rdi, rcx) // c += cs_c; - - vmovupd(ymm14, mem(rdx)) // store c40:c70 - add(rdi, rdx) // c += cs_c; - - vmovupd(ymm13, mem(rcx)) // store c01:c31 - add(rdi, rcx) // c += cs_c; - - vmovupd(ymm12, mem(rdx)) // store c41:c71 - add(rdi, rdx) // c += cs_c; - - vmovupd(ymm11, mem(rcx)) // store c02:c32 - add(rdi, rcx) // c += cs_c; - - vmovupd(ymm10, mem(rdx)) // store c42:c72 - add(rdi, rdx) // c += cs_c; - - vmovupd(ymm9, mem(rcx)) // store c03:c33 - add(rdi, rcx) // c += cs_c; - - vmovupd(ymm8, mem(rdx)) // store c43:c73 - - - - - + + vmovupd(ymm15, mem(rcx)) // store c00:c30 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm14, mem(rdx)) // store c40:c70 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm13, mem(rcx)) // store c01:c31 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm12, mem(rdx)) // store c41:c71 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm11, mem(rcx)) // store c02:c32 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm10, mem(rdx)) // store c42:c72 + add(rdi, rdx) // c += cs_c; + + vmovupd(ymm9, mem(rcx)) // store c03:c33 + add(rdi, rcx) // c += cs_c; + + vmovupd(ymm8, mem(rdx)) // store c43:c73 + label(.ZDONE) - - vzeroupper() - + vzeroupper() - + end_asm( : // output operands (none) : // input operands - [k_iter] "m" (k_iter), // 0 - [k_left] "m" (k_left), // 1 - [a] "m" (a), // 2 - [b] "m" (b), // 3 - [alpha] "m" (alpha), // 4 - [beta] "m" (beta), // 5 - [c] "m" (c), // 6 - [rs_c] "m" (rs_c), // 7 - [cs_c] "m" (cs_c)/*, // 8 - 
[b_next] "m" (b_next), // 9 - [a_next] "m" (a_next)*/ // 10 + [k_iter] "m" (k_iter), // 0 + [k_left] "m" (k_left), // 1 + [a] "m" (a), // 2 + [b] "m" (b), // 3 + [alpha] "m" (alpha), // 4 + [beta] "m" (beta), // 5 + [c] "m" (c), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c)/*, // 8 + [b_next] "m" (b_next), // 9 + [a_next] "m" (a_next)*/ // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -3510,6 +2334,8 @@ void bli_zgemm_sandybridge_asm_4x4 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMM_UKR_FLUSH_CT( z ); } diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c index 6a1bb04f5..6bf991082 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c @@ -32,14 +32,17 @@ */ -#include +#include +#include #include "blis.h" #if 0 void bli_sgemm_sandybridge_int_8x8 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, float* restrict alpha, float* restrict a, float* restrict b, @@ -52,11 +55,11 @@ void bli_sgemm_sandybridge_int_8x8 } #endif - - void bli_dgemm_sandybridge_int_8x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, double* restrict alpha, double* restrict a, double* restrict b, @@ -66,19 +69,22 @@ void bli_dgemm_sandybridge_int_8x4 cntx_t* restrict cntx ) { + //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
- uint64_t k_iter = k0 / 2; - uint64_t k_left = k0 % 2; + uint64_t k_iter = k / 2; + uint64_t k_left = k % 2; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t i; - double *c00, *c01, *c02, *c03; - double *c40, *c41, *c42, *c43; + GEMM_UKR_SETUP_CT( d, 8, 4, false ); + + double *c00, *c01, *c02, *c03; + double *c40, *c41, *c42, *c43; // Quad registers. __m256d va0_3, va4_7; @@ -87,23 +93,20 @@ void bli_dgemm_sandybridge_int_8x4 __m256d vb; __m256d vB0; - __m256d va0_3b_0, va4_7b_0; - __m256d va0_3b_1, va4_7b_1; - __m256d va0_3b_2, va4_7b_2; - __m256d va0_3b_3, va4_7b_3; - - __m256d va0_3b0, va4_7b0; - __m256d va0_3b1, va4_7b1; - __m256d va0_3b2, va4_7b2; - __m256d va0_3b3, va4_7b3; + __m256d va0_3b_0, va4_7b_0; + __m256d va0_3b_1, va4_7b_1; + __m256d va0_3b_2, va4_7b_2; + __m256d va0_3b_3, va4_7b_3; + __m256d va0_3b0, va4_7b0; + __m256d va0_3b1, va4_7b1; + __m256d va0_3b2, va4_7b2; + __m256d va0_3b3, va4_7b3; - __m256d valpha, vbeta, vtmp; + __m256d valpha, vbeta, vtmp; __m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3; __m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3; - __m128d aa, bb; - __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(a) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"(b_next) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(c) ); @@ -129,19 +132,19 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b_3 = _mm256_setzero_pd(); // Load va0_3 - va0_3 = _mm256_load_pd( a ); + va0_3 = _mm256_load_pd( a ); // Load va4_7 - va4_7 = _mm256_load_pd( a + 4 ); + va4_7 = _mm256_load_pd( a + 4 ); - // Load vb (b0,b1,b2,b3) - vb0 = _mm256_load_pd( b ); + // Load vb (b0,b1,b2,b3) + vb0 = _mm256_load_pd( b ); for( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Load va0_3 (Prefetch) - vA0_3 = _mm256_load_pd( a + 8 ); + vA0_3 = _mm256_load_pd( a + 8 ); // Iteration 0. 
vtmp = _mm256_mul_pd( va0_3, vb0 ); @@ -151,10 +154,10 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); // Load va4_7 (Prefetch) - vA4_7 = _mm256_load_pd( a + 12 ); + vA4_7 = _mm256_load_pd( a + 12 ); // Shuffle vb (b1,b0,b3,b2) - vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 ); + vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb1 ); va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); @@ -163,10 +166,10 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); // Permute vb (b3,b2,b1,b0) - vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); + vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); // Load vb (b0,b1,b2,b3) (Prefetch) - vB0 = _mm256_load_pd( b + 4 ); + vB0 = _mm256_load_pd( b + 4 ); vtmp = _mm256_mul_pd( va0_3, vb2 ); va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); @@ -175,7 +178,7 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); // Shuffle vb (b3,b2,b1,b0) - vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); + vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb3 ); va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); @@ -186,14 +189,14 @@ void bli_dgemm_sandybridge_int_8x4 // Iteration 1. 
__asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); - + // Load va0_3 (Next iteration) - va0_3 = _mm256_load_pd( a + 16 ); + va0_3 = _mm256_load_pd( a + 16 ); vtmp = _mm256_mul_pd( vA0_3, vB0 ); va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); - vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 ); + vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 ); vtmp = _mm256_mul_pd( vA4_7, vB0 ); va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); @@ -202,9 +205,9 @@ void bli_dgemm_sandybridge_int_8x4 va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); // Load va4_7 (Next iteration) - va4_7 = _mm256_load_pd( a + 20 ); + va4_7 = _mm256_load_pd( a + 20 ); - vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); + vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); vtmp = _mm256_mul_pd( vA4_7, vb1 ); va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); @@ -212,13 +215,13 @@ void bli_dgemm_sandybridge_int_8x4 vtmp = _mm256_mul_pd( vA0_3, vb2 ); va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); - vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); + vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); vtmp = _mm256_mul_pd( vA4_7, vb2 ); va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); // Load vb0(Next iteration) - vb0 = _mm256_load_pd( b + 8 ); + vb0 = _mm256_load_pd( b + 8 ); vtmp = _mm256_mul_pd( vA0_3, vb3 ); va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); @@ -236,12 +239,12 @@ void bli_dgemm_sandybridge_int_8x4 // Iteration 0. 
// Load va0_3 - va0_3 = _mm256_load_pd( a ); + va0_3 = _mm256_load_pd( a ); // Load va4_7 - va4_7 = _mm256_load_pd( a + 4 ); + va4_7 = _mm256_load_pd( a + 4 ); - // Load vb (b0,b1,b2,b3) - vb = _mm256_load_pd( b ); + // Load vb (b0,b1,b2,b3) + vb = _mm256_load_pd( b ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); @@ -250,7 +253,7 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); // Shuffle vb (b1,b0,b3,b2) - vb = _mm256_shuffle_pd( vb, vb, 0x5 ); + vb = _mm256_shuffle_pd( vb, vb, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); @@ -259,7 +262,7 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); // Permute vb (b3,b2,b1,b0) - vb = _mm256_permute2f128_pd( vb, vb, 0x1 ); + vb = _mm256_permute2f128_pd( vb, vb, 0x1 ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); @@ -268,7 +271,7 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); // Shuffle vb (b3,b2,b1,b0) - vb = _mm256_shuffle_pd( vb, vb, 0x5 ); + vb = _mm256_shuffle_pd( vb, vb, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); @@ -309,131 +312,73 @@ void bli_dgemm_sandybridge_int_8x4 va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 ); va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 ); - if( rs_c == 1 ) + __m128d vzero = _mm_setzero_pd( ); + + if( _mm_comieq_sd( _mm256_castpd256_pd128(vbeta), vzero ) ) { // Calculate address - c00 = ( c + 0*rs_c + 0*cs_c ); - // Load - //vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c ); - vc0_3_0 = _mm256_load_pd( c00 ); + c00 = ( c + 0 + 0*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b0); - // Scale by beta - vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 ); - // Add gemm result - vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp ); // Store back to memory - _mm256_store_pd( c00, vc0_3_0 ); - + _mm256_store_pd( c00, vtmp 
); + // Calculate address - c40 = ( c + 4*rs_c + 0*cs_c ); - // Load - //vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c ); - vc4_7_0 = _mm256_load_pd( c40 ); + c40 = ( c + 4 + 0*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b0); - // Scale by beta - vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 ); - // Add gemm result - vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp ); // Store back to memory - _mm256_store_pd( c40, vc4_7_0 ); - + _mm256_store_pd( c40, vtmp ); + // Calculate address - c01 = ( c + 0*rs_c + 1*cs_c ); - // Load - //vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c ); - vc0_3_1 = _mm256_load_pd( c01 ); + c01 = ( c + 0 + 1*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b1); - // Scale by beta - vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 ); - // Add gemm result - vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp ); // Store back to memory - _mm256_store_pd( c01, vc0_3_1 ); - + _mm256_store_pd( c01, vtmp ); + // Calculate address - c41 = ( c + 4*rs_c + 1*cs_c ); - // Load - //vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c ); - vc4_7_1 = _mm256_load_pd( c41 ); + c41 = ( c + 4 + 1*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b1); - // Scale by beta - vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 ); - // Add gemm result - vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp ); // Store back to memory - _mm256_store_pd( c41, vc4_7_1 ); - + _mm256_store_pd( c41, vtmp ); + // Calculate address - c02 = ( c + 0*rs_c + 2*cs_c ); - // Load - //vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c ); - vc0_3_2 = _mm256_load_pd( c02 ); + c02 = ( c + 0 + 2*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b2); - // Scale by beta - vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 ); - // Add gemm result - vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp ); // Store back to memory - _mm256_store_pd( c02, vc0_3_2 ); - + _mm256_store_pd( c02, vtmp ); + // Calculate address - c42 = ( c + 4*rs_c + 2*cs_c ); - // Load - //vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c ); - vc4_7_2 = _mm256_load_pd( 
c42 ); + c42 = ( c + 4 + 2*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b2); - // Scale by beta - vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 ); - // Add gemm result - vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp ); // Store back to memory - _mm256_store_pd( c42, vc4_7_2 ); - + _mm256_store_pd( c42, vtmp ); + // Calculate address - c03 = ( c + 0*rs_c + 3*cs_c ); - // Load - //vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c ); - vc0_3_3 = _mm256_load_pd( c03 ); + c03 = ( c + 0 + 3*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b3); - // Scale by beta - vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 ); - // Add gemm result - vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp ); // Store back to memory - _mm256_store_pd( c03, vc0_3_3 ); - + _mm256_store_pd( c03, vtmp ); + // Calculate address - c43 = ( c + 4*rs_c + 3*cs_c ); - // Load - //vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c ); - vc4_7_3 = _mm256_load_pd( c43 ); + c43 = ( c + 4 + 3*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b3); - // Scale by beta - vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 ); - // Add gemm result - vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp ); // Store back to memory - _mm256_store_pd( c43, vc4_7_3 ); - + _mm256_store_pd( c43, vtmp ); } else { // Calculate address - c00 = ( c + 0*rs_c + 0*cs_c ); + c00 = ( c + 0 + 0*cs_c ); // Load - //vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c ); - vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ), - *(c + 2*rs_c + 0*cs_c ), - *(c + 1*rs_c + 0*cs_c ), - *(c + 0*rs_c + 0*cs_c ) ); + //vc0_3_0 = _mm256_load_pd( c + 0 + 0*cs_c ); + vc0_3_0 = _mm256_load_pd( c00 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b0); // Scale by beta @@ -441,24 +386,13 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp ); // Store back to memory - //_mm256_store_pd( c00, vc0_3_0 ); - - aa = _mm256_extractf128_pd( vc0_3_0, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_0, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 0*cs_c, aa 
); - _mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 0*cs_c, bb ); - _mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb ); + _mm256_store_pd( c00, vc0_3_0 ); // Calculate address - c40 = ( c + 4*rs_c + 0*cs_c ); + c40 = ( c + 4 + 0*cs_c ); // Load - //vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c ); - vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ), - *(c + 6*rs_c + 0*cs_c ), - *(c + 5*rs_c + 0*cs_c ), - *(c + 4*rs_c + 0*cs_c ) ); + //vc4_7_0 = _mm256_load_pd( c + 4 + 0*cs_c ); + vc4_7_0 = _mm256_load_pd( c40 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b0); // Scale by beta @@ -466,24 +400,13 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp ); // Store back to memory - //_mm256_store_pd( c40, vc4_7_0 ); - - aa = _mm256_extractf128_pd( vc4_7_0, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_0, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 0*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 0*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb ); + _mm256_store_pd( c40, vc4_7_0 ); // Calculate address - c01 = ( c + 0*rs_c + 1*cs_c ); + c01 = ( c + 0 + 1*cs_c ); // Load - //vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c ); - vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ), - *(c + 2*rs_c + 1*cs_c ), - *(c + 1*rs_c + 1*cs_c ), - *(c + 0*rs_c + 1*cs_c ) ); + //vc0_3_1 = _mm256_load_pd( c + 0 + 1*cs_c ); + vc0_3_1 = _mm256_load_pd( c01 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b1); // Scale by beta @@ -491,24 +414,13 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp ); // Store back to memory - //_mm256_store_pd( c01, vc0_3_1 ); - - aa = _mm256_extractf128_pd( vc0_3_1, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_1, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 1*cs_c, aa ); - _mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 1*cs_c, bb ); - _mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb ); + _mm256_store_pd( c01, 
vc0_3_1 ); // Calculate address - c41 = ( c + 4*rs_c + 1*cs_c ); + c41 = ( c + 4 + 1*cs_c ); // Load - //vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c ); - vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ), - *(c + 6*rs_c + 1*cs_c ), - *(c + 5*rs_c + 1*cs_c ), - *(c + 4*rs_c + 1*cs_c ) ); + //vc4_7_1 = _mm256_load_pd( c + 4 + 1*cs_c ); + vc4_7_1 = _mm256_load_pd( c41 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b1); // Scale by beta @@ -516,24 +428,13 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp ); // Store back to memory - //_mm256_store_pd( c41, vc4_7_1 ); - - aa = _mm256_extractf128_pd( vc4_7_1, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_1, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 1*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 1*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb ); + _mm256_store_pd( c41, vc4_7_1 ); // Calculate address - c02 = ( c + 0*rs_c + 2*cs_c ); + c02 = ( c + 0 + 2*cs_c ); // Load - //vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c ); - vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ), - *(c + 2*rs_c + 2*cs_c ), - *(c + 1*rs_c + 2*cs_c ), - *(c + 0*rs_c + 2*cs_c ) ); + //vc0_3_2 = _mm256_load_pd( c + 0 + 2*cs_c ); + vc0_3_2 = _mm256_load_pd( c02 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b2); // Scale by beta @@ -541,24 +442,13 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp ); // Store back to memory - //_mm256_store_pd( c02, vc0_3_2 ); - - aa = _mm256_extractf128_pd( vc0_3_2, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_2, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 2*cs_c, aa ); - _mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 2*cs_c, bb ); - _mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb ); + _mm256_store_pd( c02, vc0_3_2 ); // Calculate address - c42 = ( c + 4*rs_c + 2*cs_c ); + c42 = ( c + 4 + 2*cs_c ); // Load - //vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c ); - 
vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ), - *(c + 6*rs_c + 2*cs_c ), - *(c + 5*rs_c + 2*cs_c ), - *(c + 4*rs_c + 2*cs_c ) ); + //vc4_7_2 = _mm256_load_pd( c + 4 + 2*cs_c ); + vc4_7_2 = _mm256_load_pd( c42 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b2); // Scale by beta @@ -566,24 +456,13 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp ); // Store back to memory - //_mm256_store_pd( c42, vc4_7_2 ); - - aa = _mm256_extractf128_pd( vc4_7_2, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_2, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 2*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 2*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb ); + _mm256_store_pd( c42, vc4_7_2 ); // Calculate address - c03 = ( c + 0*rs_c + 3*cs_c ); + c03 = ( c + 0 + 3*cs_c ); // Load - //vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c ); - vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ), - *(c + 2*rs_c + 3*cs_c ), - *(c + 1*rs_c + 3*cs_c ), - *(c + 0*rs_c + 3*cs_c ) ); + //vc0_3_3 = _mm256_load_pd( c + 0 + 3*cs_c ); + vc0_3_3 = _mm256_load_pd( c03 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b3); // Scale by beta @@ -591,24 +470,13 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp ); // Store back to memory - //_mm256_store_pd( c03, vc0_3_3 ); - - aa = _mm256_extractf128_pd( vc0_3_3, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_3, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 3*cs_c, aa ); - _mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 3*cs_c, bb ); - _mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb ); + _mm256_store_pd( c03, vc0_3_3 ); // Calculate address - c43 = ( c + 4*rs_c + 3*cs_c ); + c43 = ( c + 4 + 3*cs_c ); // Load - //vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c ); - vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ), - *(c + 6*rs_c + 3*cs_c ), - *(c + 5*rs_c + 3*cs_c ), - *(c + 4*rs_c + 3*cs_c ) ); + //vc4_7_3 = 
_mm256_load_pd( c + 4 + 3*cs_c ); + vc4_7_3 = _mm256_load_pd( c43 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b3); // Scale by beta @@ -616,17 +484,10 @@ void bli_dgemm_sandybridge_int_8x4 // Add gemm result vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp ); // Store back to memory - //_mm256_store_pd( c43, vc4_7_3 ); - - aa = _mm256_extractf128_pd( vc4_7_3, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_3, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 3*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 3*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb ); + _mm256_store_pd( c43, vc4_7_3 ); } + GEMM_UKR_FLUSH_CT( d ); } @@ -634,7 +495,9 @@ void bli_dgemm_sandybridge_int_8x4 #if 0 void bli_cgemm_sandybridge_int_8x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, @@ -652,7 +515,9 @@ void bli_cgemm_sandybridge_int_8x4 #if 0 void bli_zgemm_sandybridge_int_4x4 ( - dim_t k0, + dim_t m, + dim_t n, + dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c index 3a20cd861..9943a170b 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c @@ -287,24 +287,28 @@ static int64_t offsets[16] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; -void bli_dgemm_skx_asm_16x12_l2( - dim_t k_, - double* restrict alpha, - double* restrict a, - double* restrict b, - double* restrict beta, - double* restrict c, inc_t rs_c_, inc_t cs_c_, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_dgemm_skx_asm_16x12_l2 + ( + dim_t m, + dim_t n, + dim_t k_, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) { (void)data; (void)cntx; - const int64_t* offsetPtr = 
&offsets[0]; - const int64_t k = k_; - const int64_t rs_c = rs_c_; - const int64_t cs_c = cs_c_; + int64_t k = k_; + int64_t rs_c = rs_c_; + int64_t cs_c = cs_c_; + + GEMM_UKR_SETUP_CT( d, 16, 12, false ); BEGIN_ASM() @@ -464,62 +468,26 @@ void bli_dgemm_skx_asm_16x12_l2( MOV(RAX, VAR(cs_c)) LEA(RAX, MEM(,RAX,8)) - MOV(RBX, VAR(rs_c)) - LEA(RBX, MEM(,RBX,8)) - - // Check if C is column stride. If not, jump to the slow scattered update - CMP(RBX, IMM(1)) - JNE(SCATTEREDUPDATE) - - VCOMISD(XMM(1), XMM(7)) - JE(COLSTORBZ) - UPDATE_C( 8, 9,10,11) - UPDATE_C(12,13,14,15) - UPDATE_C(16,17,18,19) - UPDATE_C(20,21,22,23) - UPDATE_C(24,25,26,27) - UPDATE_C(28,29,30,31) + VCOMISD(XMM(1), XMM(7)) + JE(COLSTORBZ) - JMP(END) - LABEL(COLSTORBZ) - - UPDATE_C_BZ( 8, 9,10,11) - UPDATE_C_BZ(12,13,14,15) - UPDATE_C_BZ(16,17,18,19) - UPDATE_C_BZ(20,21,22,23) - UPDATE_C_BZ(24,25,26,27) - UPDATE_C_BZ(28,29,30,31) + UPDATE_C( 8, 9,10,11) + UPDATE_C(12,13,14,15) + UPDATE_C(16,17,18,19) + UPDATE_C(20,21,22,23) + UPDATE_C(24,25,26,27) + UPDATE_C(28,29,30,31) JMP(END) - LABEL(SCATTEREDUPDATE) - - MOV(RDI, VAR(offsetPtr)) - VMOVDQA64(ZMM(2), MEM(RDI,0*64)) - VMOVDQA64(ZMM(3), MEM(RDI,1*64)) - VPBROADCASTQ(ZMM(6), RBX) - VPMULLQ(ZMM(2), ZMM(6), ZMM(2)) - VPMULLQ(ZMM(3), ZMM(6), ZMM(3)) - - VCOMISD(XMM(1), XMM(7)) - JE(SCATTERBZ) - - UPDATE_C_ROW_SCATTERED( 8, 9,10,11) - UPDATE_C_ROW_SCATTERED(12,13,14,15) - UPDATE_C_ROW_SCATTERED(16,17,18,19) - UPDATE_C_ROW_SCATTERED(20,21,22,23) - UPDATE_C_ROW_SCATTERED(24,25,26,27) - UPDATE_C_ROW_SCATTERED(28,29,30,31) - - JMP(END) - LABEL(SCATTERBZ) - - UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11) - UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15) - UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19) - UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23) - UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27) - UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31) + LABEL(COLSTORBZ) + + UPDATE_C_BZ( 8, 9,10,11) + UPDATE_C_BZ(12,13,14,15) + UPDATE_C_BZ(16,17,18,19) + UPDATE_C_BZ(20,21,22,23) + UPDATE_C_BZ(24,25,26,27) + 
UPDATE_C_BZ(28,29,30,31) LABEL(END) @@ -535,8 +503,7 @@ void bli_dgemm_skx_asm_16x12_l2( [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), - [cs_c] "m" (cs_c), - [offsetPtr] "m" (offsetPtr) + [cs_c] "m" (cs_c) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", @@ -545,4 +512,6 @@ void bli_dgemm_skx_asm_16x12_l2( "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c index 136f31532..e3bc52041 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c @@ -153,24 +153,28 @@ static int64_t offsets[16] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; -void bli_dgemm_skx_asm_16x14( - dim_t k_, - double* restrict alpha, - double* restrict a, - double* restrict b, - double* restrict beta, - double* restrict c, inc_t rs_c_, inc_t cs_c_, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_dgemm_skx_asm_16x14 + ( + dim_t m, + dim_t n, + dim_t k_, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) { (void)data; (void)cntx; - const int64_t* offsetPtr = &offsets[0]; - const int64_t k = k_; - const int64_t rs_c = rs_c_*8; - const int64_t cs_c = cs_c_*8; + int64_t k = k_; + int64_t rs_c = rs_c_; + int64_t cs_c = cs_c_; + + GEMM_UKR_SETUP_CT( d, 16, 14, false ); BEGIN_ASM() @@ -220,6 +224,8 @@ void bli_dgemm_skx_asm_16x14( MOV(R12, VAR(rs_c)) MOV(R10, VAR(cs_c)) + LEA(R12, MEM(,R12,8)) + LEA(R10, MEM(,R10,8)) MOV(RDI, RSI) AND(RSI, IMM(3)) @@ -320,119 +326,41 @@ void bli_dgemm_skx_asm_16x14( MOV(RAX, R12) MOV(RBX, R10) - // Check if C is column stride. 
- CMP(RAX, IMM(8)) - JNE(SCATTEREDUPDATE) - - VCOMISD(XMM(1), XMM(2)) - JE(COLSTORBZ) - - UPDATE_C( 4, 5) - UPDATE_C( 6, 7) - UPDATE_C( 8, 9) - UPDATE_C(10,11) - UPDATE_C(12,13) - UPDATE_C(14,15) - UPDATE_C(16,17) - UPDATE_C(18,19) - UPDATE_C(20,21) - UPDATE_C(22,23) - UPDATE_C(24,25) - UPDATE_C(26,27) - UPDATE_C(28,29) - UPDATE_C(30,31) - - JMP(END) - LABEL(COLSTORBZ) - - UPDATE_C_BZ( 4, 5) - UPDATE_C_BZ( 6, 7) - UPDATE_C_BZ( 8, 9) - UPDATE_C_BZ(10,11) - UPDATE_C_BZ(12,13) - UPDATE_C_BZ(14,15) - UPDATE_C_BZ(16,17) - UPDATE_C_BZ(18,19) - UPDATE_C_BZ(20,21) - UPDATE_C_BZ(22,23) - UPDATE_C_BZ(24,25) - UPDATE_C_BZ(26,27) - UPDATE_C_BZ(28,29) - UPDATE_C_BZ(30,31) + VCOMISD(XMM(1), XMM(2)) + JE(COLSTORBZ) + + UPDATE_C( 4, 5) + UPDATE_C( 6, 7) + UPDATE_C( 8, 9) + UPDATE_C(10,11) + UPDATE_C(12,13) + UPDATE_C(14,15) + UPDATE_C(16,17) + UPDATE_C(18,19) + UPDATE_C(20,21) + UPDATE_C(22,23) + UPDATE_C(24,25) + UPDATE_C(26,27) + UPDATE_C(28,29) + UPDATE_C(30,31) JMP(END) - LABEL(SCATTEREDUPDATE) - - VMULPD(ZMM( 4), ZMM( 4), ZMM(0)) - VMULPD(ZMM( 5), ZMM( 5), ZMM(0)) - VMULPD(ZMM( 6), ZMM( 6), ZMM(0)) - VMULPD(ZMM( 7), ZMM( 7), ZMM(0)) - VMULPD(ZMM( 8), ZMM( 8), ZMM(0)) - VMULPD(ZMM( 9), ZMM( 9), ZMM(0)) - VMULPD(ZMM(10), ZMM(10), ZMM(0)) - VMULPD(ZMM(11), ZMM(11), ZMM(0)) - VMULPD(ZMM(12), ZMM(12), ZMM(0)) - VMULPD(ZMM(13), ZMM(13), ZMM(0)) - VMULPD(ZMM(14), ZMM(14), ZMM(0)) - VMULPD(ZMM(15), ZMM(15), ZMM(0)) - VMULPD(ZMM(16), ZMM(16), ZMM(0)) - VMULPD(ZMM(17), ZMM(17), ZMM(0)) - VMULPD(ZMM(18), ZMM(18), ZMM(0)) - VMULPD(ZMM(19), ZMM(19), ZMM(0)) - VMULPD(ZMM(20), ZMM(20), ZMM(0)) - VMULPD(ZMM(21), ZMM(21), ZMM(0)) - VMULPD(ZMM(22), ZMM(22), ZMM(0)) - VMULPD(ZMM(23), ZMM(23), ZMM(0)) - VMULPD(ZMM(24), ZMM(24), ZMM(0)) - VMULPD(ZMM(25), ZMM(25), ZMM(0)) - VMULPD(ZMM(26), ZMM(26), ZMM(0)) - VMULPD(ZMM(27), ZMM(27), ZMM(0)) - VMULPD(ZMM(28), ZMM(28), ZMM(0)) - VMULPD(ZMM(29), ZMM(29), ZMM(0)) - VMULPD(ZMM(30), ZMM(30), ZMM(0)) - VMULPD(ZMM(31), ZMM(31), ZMM(0)) - - VCOMISD(XMM(1), 
XMM(2)) - - MOV(RDI, VAR(offsetPtr)) - VPBROADCASTQ(ZMM(0), RAX) - VPMULLQ(ZMM(2), ZMM(0), MEM(RDI)) - VPMULLQ(ZMM(3), ZMM(0), MEM(RDI,64)) - - JE(SCATTERBZ) - - UPDATE_C_COL_SCATTERED( 4, 5) - UPDATE_C_COL_SCATTERED( 6, 7) - UPDATE_C_COL_SCATTERED( 8, 9) - UPDATE_C_COL_SCATTERED(10,11) - UPDATE_C_COL_SCATTERED(12,13) - UPDATE_C_COL_SCATTERED(14,15) - UPDATE_C_COL_SCATTERED(16,17) - UPDATE_C_COL_SCATTERED(18,19) - UPDATE_C_COL_SCATTERED(20,21) - UPDATE_C_COL_SCATTERED(22,23) - UPDATE_C_COL_SCATTERED(24,25) - UPDATE_C_COL_SCATTERED(26,27) - UPDATE_C_COL_SCATTERED(28,29) - UPDATE_C_COL_SCATTERED(30,31) - - JMP(END) - LABEL(SCATTERBZ) - - UPDATE_C_BZ_COL_SCATTERED( 4, 5) - UPDATE_C_BZ_COL_SCATTERED( 6, 7) - UPDATE_C_BZ_COL_SCATTERED( 8, 9) - UPDATE_C_BZ_COL_SCATTERED(10,11) - UPDATE_C_BZ_COL_SCATTERED(12,13) - UPDATE_C_BZ_COL_SCATTERED(14,15) - UPDATE_C_BZ_COL_SCATTERED(16,17) - UPDATE_C_BZ_COL_SCATTERED(18,19) - UPDATE_C_BZ_COL_SCATTERED(20,21) - UPDATE_C_BZ_COL_SCATTERED(22,23) - UPDATE_C_BZ_COL_SCATTERED(24,25) - UPDATE_C_BZ_COL_SCATTERED(26,27) - UPDATE_C_BZ_COL_SCATTERED(28,29) - UPDATE_C_BZ_COL_SCATTERED(30,31) + LABEL(COLSTORBZ) + + UPDATE_C_BZ( 4, 5) + UPDATE_C_BZ( 6, 7) + UPDATE_C_BZ( 8, 9) + UPDATE_C_BZ(10,11) + UPDATE_C_BZ(12,13) + UPDATE_C_BZ(14,15) + UPDATE_C_BZ(16,17) + UPDATE_C_BZ(18,19) + UPDATE_C_BZ(20,21) + UPDATE_C_BZ(22,23) + UPDATE_C_BZ(24,25) + UPDATE_C_BZ(26,27) + UPDATE_C_BZ(28,29) + UPDATE_C_BZ(30,31) LABEL(END) @@ -449,8 +377,7 @@ void bli_dgemm_skx_asm_16x14( [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), - [cs_c] "m" (cs_c), - [offsetPtr] "m" (offsetPtr) + [cs_c] "m" (cs_c) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", @@ -459,4 +386,6 @@ void bli_dgemm_skx_asm_16x14( "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) + + GEMM_UKR_FLUSH_CT( d ); } diff --git 
a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c index 40af49614..8808449b6 100644 --- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c @@ -317,24 +317,28 @@ ahead*/ static int64_t offsets[16] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; -void bli_sgemm_skx_asm_32x12_l2( - dim_t k_, - float* restrict alpha, - float* restrict a, - float* restrict b, - float* restrict beta, - float* restrict c, inc_t rs_c_, inc_t cs_c_, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_sgemm_skx_asm_32x12_l2 + ( + dim_t m, + dim_t n, + dim_t k_, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) { (void)data; (void)cntx; - const int64_t* offsetPtr = &offsets[0]; - const int64_t k = k_; - const int64_t rs_c = rs_c_; - const int64_t cs_c = cs_c_; + int64_t k = k_; + int64_t rs_c = rs_c_; + int64_t cs_c = cs_c_; + + GEMM_UKR_SETUP_CT( s, 32, 12, false ); BEGIN_ASM() @@ -381,7 +385,7 @@ void bli_sgemm_skx_asm_32x12_l2( #endif #ifdef PREFETCH_B_BEFORE - /* Prefetching 3 cachlines of B (4 iterations worth of data + /* Prefetching 3 cachlines of B (4 iterations worth of data (12 (NR) x 4 (sizeof(float)) x 4 iter /64 = 3 cachelines) */ PREFETCH(0, MEM(RBX,0*64)) PREFETCH(0, MEM(RBX,1*64)) @@ -485,66 +489,26 @@ void bli_sgemm_skx_asm_32x12_l2( MOV(RAX, VAR(cs_c)) LEA(RAX, MEM(,RAX,4)) - MOV(RBX, VAR(rs_c)) - LEA(RBX, MEM(,RBX,4)) - - - // Check if C is column major (rs_c = 1). 
If not, jump to the slow scattered update - CMP(RBX, IMM(4)) - JNE(SCATTEREDUPDATE) - - VCOMISS(XMM(1), XMM(7)) - JE(COLSTORBZ) - UPDATE_C( 8, 9,10,11) - UPDATE_C(12,13,14,15) - UPDATE_C(16,17,18,19) - UPDATE_C(20,21,22,23) - UPDATE_C(24,25,26,27) - UPDATE_C(28,29,30,31) + VCOMISS(XMM(1), XMM(7)) + JE(COLSTORBZ) - JMP(END) - LABEL(COLSTORBZ) - - UPDATE_C_BZ( 8, 9,10,11) - UPDATE_C_BZ(12,13,14,15) - UPDATE_C_BZ(16,17,18,19) - UPDATE_C_BZ(20,21,22,23) - UPDATE_C_BZ(24,25,26,27) - UPDATE_C_BZ(28,29,30,31) + UPDATE_C( 8, 9,10,11) + UPDATE_C(12,13,14,15) + UPDATE_C(16,17,18,19) + UPDATE_C(20,21,22,23) + UPDATE_C(24,25,26,27) + UPDATE_C(28,29,30,31) JMP(END) - LABEL(SCATTEREDUPDATE) - - LEA(RDX, MEM(RCX,RBX,8)) - LEA(RDX, MEM(RDX,RBX,8)) - - MOV(RDI, VAR(offsetPtr)) - VMOVDQA64(ZMM(2), MEM(RDI,0*64)) - VMOVDQA64(ZMM(3), MEM(RDI,1*64)) - VPBROADCASTQ(ZMM(6), RBX) - VPMULLQ(ZMM(2), ZMM(6), ZMM(2)) - VPMULLQ(ZMM(3), ZMM(6), ZMM(3)) - - VCOMISS(XMM(1), XMM(7)) - JE(SCATTERBZ) - - UPDATE_C_ROW_SCATTERED( 8, 9,10,11) - UPDATE_C_ROW_SCATTERED(12,13,14,15) - UPDATE_C_ROW_SCATTERED(16,17,18,19) - UPDATE_C_ROW_SCATTERED(20,21,22,23) - UPDATE_C_ROW_SCATTERED(24,25,26,27) - UPDATE_C_ROW_SCATTERED(28,29,30,31) - - JMP(END) - LABEL(SCATTERBZ) - - UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11) - UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15) - UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19) - UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23) - UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27) - UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31) + LABEL(COLSTORBZ) + + UPDATE_C_BZ( 8, 9,10,11) + UPDATE_C_BZ(12,13,14,15) + UPDATE_C_BZ(16,17,18,19) + UPDATE_C_BZ(20,21,22,23) + UPDATE_C_BZ(24,25,26,27) + UPDATE_C_BZ(28,29,30,31) LABEL(END) @@ -560,8 +524,7 @@ void bli_sgemm_skx_asm_32x12_l2( [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), - [cs_c] "m" (cs_c), - [offsetPtr] "m" (offsetPtr) + [cs_c] "m" (cs_c) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", 
"zmm2", "zmm3", "zmm4", "zmm5", @@ -570,4 +533,6 @@ void bli_sgemm_skx_asm_32x12_l2( "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) + + GEMM_UKR_FLUSH_CT( s ); } diff --git a/ref_kernels/3/bb/bli_gemmbb_ref.c b/ref_kernels/3/bb/bli_gemmbb_ref.c index b45718d45..4c75c064c 100644 --- a/ref_kernels/3/bb/bli_gemmbb_ref.c +++ b/ref_kernels/3/bb/bli_gemmbb_ref.c @@ -42,6 +42,8 @@ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ @@ -59,9 +61,6 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ \ const inc_t cs_a = packmr; \ \ diff --git a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c index 681b740b5..dd4e1f153 100644 --- a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c +++ b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c @@ -87,6 +87,8 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ gemm_ukr \ ( \ + mr, \ + nr, \ k, \ minus_one, \ a1x, \ diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 931fe994b..51ff9df4b 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -44,6 +44,8 @@ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ @@ -107,8 +109,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - for ( dim_t i = 0; i < mr; ++i ) \ - for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -117,8 +119,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else \ { \ - for ( dim_t i = 0; i < mr; ++i ) \ - for ( dim_t j = 0; j < nr; ++j ) \ + 
for ( dim_t i = 0; i < m; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -133,8 +135,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - for ( dim_t j = 0; j < nr; ++j ) \ - for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -143,8 +145,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else \ { \ - for ( dim_t j = 0; j < nr; ++j ) \ - for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -171,6 +173,8 @@ GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ @@ -188,9 +192,6 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ \ const inc_t cs_a = packmr; \ \ diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 2b756963e..2b260c881 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -52,6 +52,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ + const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t rs_b = packnr; \ @@ -68,6 +70,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ gemm_ukr \ ( \ + mr, \ + nr, \ k, \ minus_one, \ a1x, \ diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index 6d2464de9..fbd15d695 100644 --- 
a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -39,6 +39,8 @@ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ @@ -59,6 +61,9 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t mr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ + const dim_t nr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ \ const dim_t k2 = 2 * k; \ \ @@ -118,6 +123,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ +\ + /* If we are not computing a full micro-tile, then we must write to + ct and then accumulate to c afterwards. */ \ + if ( mr != m || nr != n ) using_ct = TRUE; \ +\ \ if ( using_ct ) \ { \ @@ -149,6 +159,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ + mr_r, \ + nr_r, \ k2, \ alpha_r, \ a_r, \ @@ -164,8 +176,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ /* Accumulate the final result in ct back to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ @@ -173,8 +185,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ @@ -182,8 +194,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else \ { \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ *beta, \ @@ -215,6 +227,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ + mr_r, \ + nr_r, \ k2, \ alpha_r, \ a_r, \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 5cfaee9ec..96f5a16fe 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -153,6 +153,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ upper: bt = -1.0 * a12 * b21; */ \ rgemm_ukr \ ( \ + mr_r, \ + nr_r, \ k2, \ minus_one_r, \ a1x_r, \ diff --git a/test/syrk_diagonal/complex_math.hpp b/test/syrk_diagonal/complex_math.hpp new file mode 100644 index 000000000..9c68e730a --- /dev/null +++ b/test/syrk_diagonal/complex_math.hpp @@ -0,0 +1,267 @@ +#include +#include +#include + +#include "blis.h" + +template +struct is_complex : std::false_type {}; + +template <> +struct is_complex : std::true_type {}; + +template <> +struct is_complex : std::true_type {}; + +template +struct is_real : std::integral_constant::value> {}; + +template struct make_complex; + +template <> struct make_complex { using type = scomplex; }; +template <> struct make_complex { using type = dcomplex; }; +template <> 
struct make_complex { using type = scomplex; }; +template <> struct make_complex { using type = dcomplex; }; + +template +using make_complex_t = typename make_complex::type; + +template struct make_real; + +template <> struct make_real { using type = float; }; +template <> struct make_real { using type = double; }; +template <> struct make_real { using type = float; }; +template <> struct make_real { using type = double; }; + +template +using make_real_t = typename make_real::type; + +template +struct make_complex_if : std::conditional,make_real_t> {}; + +template +using make_complex_if_t = typename make_complex_if::type; + +template +struct real_imag_part +{ + real_imag_part& operator=(T) { return *this; } + + operator T() const { return T(); } +}; + +template +std::enable_if_t::type>::value,T&> real(T& x) { return x; } + +template +std::enable_if_t::value,real_imag_part> imag(T x) { return {}; } + +inline float& real(scomplex& x) { return x.real; } + +inline float& imag(scomplex& x) { return x.imag; } + +inline double& real(dcomplex& x) { return x.real; } + +inline double& imag(dcomplex& x) { return x.imag; } + +inline const float& real(const scomplex& x) { return x.real; } + +inline const float& imag(const scomplex& x) { return x.imag; } + +inline const double& real(const dcomplex& x) { return x.real; } + +inline const double& imag(const dcomplex& x) { return x.imag; } + +template +std::enable_if_t::value,T> conj(T x) { return x; } + +template +std::enable_if_t::value,T> conj(const T& x) { return {x.real, -x.imag}; } + +template +struct convert_impl; + +template +struct convert_impl::value && is_real::value>> +{ + void operator()(T x, U& y) const { y = x; } +}; + +template +struct convert_impl::value && is_complex::value>> +{ + void operator()(T x, U& y) const { y.real = x; y.imag = 0; } +}; + +template +struct convert_impl::value && is_real::value>> +{ + void operator()(T x, U& y) const { y = x.real; } +}; + +template +struct convert_impl::value && 
is_complex::value>> +{ + void operator()(T x, U& y) const { y.real = x.real; y.imag = x.imag; } +}; + +template +U convert(T x) +{ + U y; + convert_impl{}(x,y); + return y; +} + +template +auto convert_prec(T x) -> make_complex_if_t::value> +{ + return convert::value>>(x); +} + +#define COMPLEX_MATH_OPS(rtype, ctype) \ +\ +inline bool operator==(rtype x, ctype y) \ +{ \ + return x == y.real && y.imag == 0; \ +} \ +\ +inline bool operator==(ctype x, rtype y) \ +{ \ + return y == x.real && x.imag == 0; \ +} \ +\ +inline bool operator==(ctype x, ctype y) \ +{ \ + return x.real == y.real && \ + x.imag == y.imag; \ + } \ + \ +inline ctype operator-(ctype x) \ +{ \ + return {-x.real, -x.imag}; \ +} \ +\ +inline ctype operator+(rtype x, ctype y) \ +{ \ + return {x+y.real, y.imag}; \ +} \ +\ +inline ctype operator+(ctype x, rtype y) \ +{ \ + return {y+x.real, x.imag}; \ +} \ +\ +inline ctype operator+(ctype x, ctype y) \ +{ \ + return {x.real+y.real, x.imag+y.imag}; \ +} \ +\ +inline ctype operator-(rtype x, ctype y) \ +{ \ + return {x-y.real, -y.imag}; \ +} \ +\ +inline ctype operator-(ctype x, rtype y) \ +{ \ + return {x.real-y, x.imag}; \ +} \ +\ +inline ctype operator-(ctype x, ctype y) \ +{ \ + return {x.real-y.real, x.imag-y.imag}; \ +} \ +\ +inline ctype operator*(rtype x, ctype y) \ +{ \ + return {x*y.real, x*y.imag}; \ +} \ +\ +inline ctype operator*(ctype x, rtype y) \ +{ \ + return {y*x.real, y*x.imag}; \ +} \ +\ +inline ctype operator*(ctype x, ctype y) \ +{ \ + return {x.real*y.real - x.imag*y.imag, \ + x.real*y.imag + x.imag*y.real}; \ +} \ +\ +inline ctype operator/(rtype x, ctype y) \ +{ \ + auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \ + auto n = std::ilogb(scale); \ + auto yrs = std::scalbn(y.real, -n); \ + auto yis = std::scalbn(y.imag, -n); \ + auto denom = y.real*yrs + y.imag*yis; \ + return {x*yrs/denom, -x*yis/denom}; \ +} \ +\ +inline ctype operator/(ctype x, rtype y) \ +{ \ + return {x.real/y, x.imag/y}; \ +} \ +\ +inline ctype 
operator/(ctype x, ctype y) \ +{ \ + auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \ + auto n = std::ilogb(scale); \ + auto yrs = std::scalbn(y.real, -n); \ + auto yis = std::scalbn(y.imag, -n); \ + auto denom = y.real*yrs + y.imag*yis; \ + return {(x.real*yrs + x.imag*yis)/denom, \ + (x.imag*yrs - x.real*yis)/denom}; \ +} \ +\ +inline ctype& operator+=(ctype& x, rtype y) \ +{ \ + x.real += y; \ + return x; \ +} \ +\ +inline ctype& operator+=(ctype& x, ctype y) \ +{ \ + x.real += y.real; x.imag += y.imag; \ + return x; \ +} \ +\ +inline ctype& operator-=(ctype& x, rtype y) \ +{ \ + x.real -= y; \ + return x; \ +} \ +\ +inline ctype& operator-=(ctype& x, ctype y) \ +{ \ + x.real -= y.real; x.imag -= y.imag; \ + return x; \ +} \ +\ +inline ctype& operator*=(ctype& x, rtype y) \ +{ \ + x.real *= y; x.imag *= y; \ + return x; \ +} \ +\ +inline ctype& operator*=(ctype& x, ctype y) \ +{ \ + x = x * y; \ + return x; \ +} \ +\ +inline ctype& operator/=(ctype& x, rtype y) \ +{ \ + x.real /= y; x.imag /= y; \ + return x; \ +} \ +\ +inline ctype& operator/=(ctype& x, ctype y) \ +{ \ + x = x / y; \ + return x; \ +} + +COMPLEX_MATH_OPS(float, scomplex); +COMPLEX_MATH_OPS(double, dcomplex); + diff --git a/test/syrk_diagonal/syrk_diagonal_example.c b/test/syrk_diagonal/syrk_diagonal_example.c new file mode 100644 index 000000000..c2bfd8fa1 --- /dev/null +++ b/test/syrk_diagonal/syrk_diagonal_example.c @@ -0,0 +1,186 @@ +#include "syrk_diagonal_ref.h" + +/* + * Structure which includes all additional information beyond what is + * already stored in the obj_t structure. + * + * This structure is **read-only** during the operation! + */ +typedef struct packm_diag_params_t +{ + packm_blk_var1_params_t super; + void* d; + inc_t incd; +} packm_diag_params_t; + +/* + * Declare the pack kernel type and set up an array of + * packing kernels, one for each data type.
+ */ +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +void PASTEMAC(ch,op) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx, \ + void* params \ + ) \ +{ \ + packm_diag_params_t* params_cast = params; \ + ctype* restrict a_cast = a; \ + ctype* restrict p_cast = p; \ + ctype* restrict d_cast = params_cast->d; \ + inc_t incd = params_cast->incd; \ + ctype kappa_cast = *( ctype* )kappa; \ +\ + if ( schema != BLIS_PACKED_ROW_PANELS && \ + schema != BLIS_PACKED_COL_PANELS ) \ + bli_abort(); \ +\ + /* Apply the offset */ \ + d_cast += panel_len_off * incd; \ +\ + if ( conja ) \ + { \ + for ( dim_t j = 0; j < panel_len; j++ ) \ + { \ + ctype kappa_d; \ + PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \ +\ + for (dim_t i = 0;i < panel_dim;i++) \ + PASTEMAC(ch,scal2js)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \ +\ + for (dim_t i = panel_dim;i < panel_dim_max;i++) \ + PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \ + } \ + } \ + else \ + { \ + for ( dim_t j = 0; j < panel_len; j++ ) \ + { \ + ctype kappa_d; \ + PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \ +\ + for (dim_t i = 0;i < panel_dim;i++) \ + PASTEMAC(ch,scal2s)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \ +\ + for (dim_t i = panel_dim;i < panel_dim_max;i++) \ + PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \ + } \ + } \ +\ + for (dim_t j = panel_len;j < panel_len_max;j++) \ + for (dim_t i = 0;i < panel_dim_max;i++) \ + PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \ +} + +INSERT_GENTFUNC_BASIC0(packm_diag_ukr); + +static packm_ker_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr ); + +/* + * Modify the object A to 
include information about the diagonal D, + * and imbue it with special function pointers which will take care + * of the actual work of forming (D * A^T) + */ +void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a ) +{ + memset( params, 0, sizeof(*params) ); + + // Assumes D is a column vector + params->d = bli_obj_buffer_at_off( d ); + params->incd = bli_obj_row_stride( d ); + + for ( int i = BLIS_DT_LO; i <= BLIS_DT_HI; i++ ) + params->super.ukr_fn[i][i] = packm_diag_ukrs[i]; + + // Attach the parameters to the A object. + bli_obj_set_pack_params( params, a ); +} + +/* + * Implements C := alpha * A * D * A^T + beta * C + * + * where D is a diagonal matrix with elements taken from the "d" vector. + */ +void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c ) +{ + obj_t ad; // this is (D * A^T) + packm_diag_params_t params; + + bli_obj_alias_to( a, &ad ); + bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T + attach_diagonal_factor( &params, d, &ad ); + + // Does C := alpha * A * B + beta * C using B = (D * A^T) + bli_gemmtnat( alpha, a, &ad, beta, c, NULL, NULL ); +} + +int main( void ) +{ + obj_t a; + obj_t d; + obj_t c; + obj_t c_copy; + obj_t norm; + + dim_t m = 10; + dim_t k = 10; + + for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ ) + for ( int upper = 0; upper <= 1; upper++ ) + for ( int transa = 0; transa <= 1; transa++ ) + for ( int transc = 0; transc <= 1; transc++ ) + { + num_t dt = dt_; + uplo_t uplo = upper ? BLIS_UPPER : BLIS_LOWER; + + bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a ); + bli_obj_create( dt, k, 1, 1, 1, &d ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ?
1 : m, &c_copy ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy ); + bli_obj_set_uplo( uplo , &c ); + bli_obj_set_uplo( uplo , &c_copy ); + bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm ); + + bli_randm( &a ); + bli_randm( &d ); + bli_randm( &c ); + bli_copym( &c, &c_copy ); + + syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c ); + syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy ); + + bli_subm( &c_copy, &c ); + bli_normfm( &c, &norm ); + + double normr, normi; + bli_getsc( &norm, &normr, &normi ); + + printf( "dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n", + dt, upper, transa, transc, normr ); + + bli_obj_free( &a ); + bli_obj_free( &d ); + bli_obj_free( &c ); + bli_obj_free( &c_copy ); + bli_obj_free( &norm ); + } +} diff --git a/test/syrk_diagonal/syrk_diagonal_example.cxx b/test/syrk_diagonal/syrk_diagonal_example.cxx new file mode 100644 index 000000000..1c269d5c4 --- /dev/null +++ b/test/syrk_diagonal/syrk_diagonal_example.cxx @@ -0,0 +1,220 @@ +#include "syrk_diagonal_ref.h" + +/* + * Forward-declare the pack kernel type and set up an array of + * packing kernels, one for each data type. + */ +template +void packm_diag_ukr + ( + struc_t /*struca*/, + diag_t /*diaga*/, + uplo_t /*uploa*/, + conj_t conja, + pack_t schema, + bool /*invdiag*/, + dim_t panel_dim, + dim_t panel_len, + dim_t panel_dim_max, + dim_t panel_len_max, + dim_t /*panel_dim_off*/, + dim_t panel_len_off, + void* restrict kappa, + void* restrict a, inc_t inca, inc_t lda, + void* restrict p, inc_t ldp, + inc_t /*is_p*/, + cntx_t* /*cntx*/, + void* params + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +static auto PASTEMAC(ch,op) = &packm_diag_ukr; + +INSERT_GENTFUNC_BASIC0(packm_diag_ukr); + +static packm_ker_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr ); + +/* + * Structure which includes all additional information beyond what is + * already stored in the obj_t structure.
+ * + * This structure is **read-only** during the operation! + */ +struct packm_diag_params_t : packm_blk_var1_params_t +{ + void* d; + inc_t incd; + + packm_diag_params_t() {} + + packm_diag_params_t( void* d, inc_t incd ) + : d(d), incd(incd) + { + for ( int i = BLIS_DT_LO; i <= BLIS_DT_HI; i++ ) + ukr_fn[i][i] = packm_diag_ukrs[i]; + } +}; + +/* + * Selecting a different kernel based on the current architecture is + * currently not possible, but is something we plan to support. + */ +template +void packm_diag_ukr + ( + struc_t /*struca*/, + diag_t /*diaga*/, + uplo_t /*uploa*/, + conj_t conja, + pack_t schema, + bool /*invdiag*/, + dim_t panel_dim, + dim_t panel_len, + dim_t panel_dim_max, + dim_t panel_len_max, + dim_t /*panel_dim_off*/, + dim_t panel_len_off, + void* restrict kappa, + void* restrict a, inc_t inca, inc_t lda, + void* restrict p, inc_t ldp, + inc_t /*is_p*/, + cntx_t* /*cntx*/, + void* params + ) +{ + auto params_cast = ( packm_diag_params_t* )params; + T* restrict a_cast = ( T* )a; + T* restrict p_cast = ( T* )p; + T* restrict d_cast = ( T* )params_cast->d; + auto incd = params_cast->incd; + auto kappa_cast = *( T* )kappa; + + if ( schema != BLIS_PACKED_ROW_PANELS && + schema != BLIS_PACKED_COL_PANELS ) + bli_abort(); + + /* Apply the offset */ + d_cast += panel_len_off * incd; + + if ( conja ) + { + for ( dim_t j = 0; j < panel_len; j++ ) + { + auto kappa_d = kappa_cast * d_cast[ j*incd ]; + + for (dim_t i = 0;i < panel_dim;i++) + p_cast[ i + j*ldp ] = kappa_d * conj( a_cast[ i*inca + j*lda ] ); + + for (dim_t i = panel_dim;i < panel_dim_max;i++) + p_cast[ i + j*ldp ] = convert(0.0); + } + } + else + { + for ( dim_t j = 0; j < panel_len; j++ ) + { + auto kappa_d = kappa_cast * d_cast[ j*incd ]; + + for (dim_t i = 0;i < panel_dim;i++) + p_cast[ i + j*ldp ] = kappa_d * a_cast[ i*inca + j*lda ]; + + for (dim_t i = panel_dim;i < panel_dim_max;i++) + p_cast[ i + j*ldp ] = convert(0.0); + } + } + + for (dim_t j = panel_len;j < panel_len_max;j++) + 
for (dim_t i = 0;i < panel_dim_max;i++) + p_cast[ i + j*ldp ] = convert(0.0); +} + +/* + * Modify the object A to include information about the diagonal D, + * and imbue it with special function pointers which will take care + * of the actual work of forming (D * A^T) + */ +void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a ) +{ + // Assumes D is a column vector + new (params) packm_diag_params_t + ( + bli_obj_buffer_at_off( d ), + bli_obj_row_stride( d ) + ); + + // Attach the parameters to the A object. + bli_obj_set_pack_params( params, a ); +} + +/* + * Implements C := alpha * A * D * A^T + beta * C + * + * where D is a diagonal matrix with elements taken from the "d" vector. + */ +void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c ) +{ + obj_t ad; // this is (D * A^T) + packm_diag_params_t params; + + bli_obj_alias_to( a, &ad ); + bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T + attach_diagonal_factor( &params, d, &ad ); + + // Does C := alpha * A * B + beta * C using B = (D * A^T) + bli_gemmtnat( alpha, a, &ad, beta, c, NULL, NULL ); +} + +int main() +{ + obj_t a; + obj_t d; + obj_t c; + obj_t c_copy; + obj_t norm; + + auto m = 10; + auto k = 10; + + for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ ) + for ( int upper = 0; upper <= 1; upper++ ) + for ( int transa = 0; transa <= 1; transa++ ) + for ( int transc = 0; transc <= 1; transc++ ) + { + auto dt = ( num_t )dt_; + auto uplo = upper ? BLIS_UPPER : BLIS_LOWER; + + bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a ); + bli_obj_create( dt, k, 1, 1, 1, &d ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ?
1 : m, &c_copy ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy ); + bli_obj_set_uplo( uplo , &c ); + bli_obj_set_uplo( uplo , &c_copy ); + bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm ); + + bli_randm( &a ); + bli_randm( &d ); + bli_randm( &c ); + bli_copym( &c, &c_copy ); + + syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c ); + syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy ); + + bli_subm( &c_copy, &c ); + bli_normfm( &c, &norm ); + + double normr, normi; + bli_getsc( &norm, &normr, &normi ); + + printf("dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n", + dt, upper, transa, transc, normr); + + bli_obj_free( &a ); + bli_obj_free( &d ); + bli_obj_free( &c ); + bli_obj_free( &c_copy ); + bli_obj_free( &norm ); + } +} diff --git a/test/syrk_diagonal/syrk_diagonal_example2.c b/test/syrk_diagonal/syrk_diagonal_example2.c new file mode 100644 index 000000000..92371f48b --- /dev/null +++ b/test/syrk_diagonal/syrk_diagonal_example2.c @@ -0,0 +1,354 @@ +#include "syrk_diagonal_ref.h" + +/* + * Structure which includes all additional information beyond what is + * already stored in the obj_t structure. + * + * This structure is **read-only** during the operation! + */ +typedef struct packm_diag_params_t +{ + void* d; + inc_t incd; +} packm_diag_params_t; + +typedef void (*packm_diag_ukr_vft) + ( + bool conja, + dim_t panel_dim, + dim_t panel_len, + dim_t panel_dim_max, + dim_t panel_len_max, + void* restrict kappa, + void* restrict d, inc_t incd, + void* restrict a, inc_t inca, inc_t lda, + void* restrict p, inc_t ldp + ); + +/* + * Declare the pack kernel type and set up an array of + * packing kernels, one for each data type.
+ */ +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +void PASTEMAC(ch,op) \ + ( \ + bool conja, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + void* restrict kappa, \ + void* restrict d, inc_t incd, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + ctype* restrict a_cast = a; \ + ctype* restrict p_cast = p; \ + ctype* restrict d_cast = d; \ + ctype kappa_cast = *( ctype* )kappa; \ +\ + if ( conja ) \ + { \ + for ( dim_t j = 0; j < panel_len; j++ ) \ + { \ + ctype kappa_d; \ + PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \ +\ + for (dim_t i = 0;i < panel_dim;i++) \ + PASTEMAC(ch,scal2js)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \ +\ + for (dim_t i = panel_dim;i < panel_dim_max;i++) \ + PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \ + } \ + } \ + else \ + { \ + for ( dim_t j = 0; j < panel_len; j++ ) \ + { \ + ctype kappa_d; \ + PASTEMAC(ch,scal2s)( kappa_cast, d_cast[ j*incd ], kappa_d ); \ +\ + for (dim_t i = 0;i < panel_dim;i++) \ + PASTEMAC(ch,scal2s)( kappa_d, a_cast[ i*inca + j*lda ], p_cast[ i + j*ldp ] ); \ +\ + for (dim_t i = panel_dim;i < panel_dim_max;i++) \ + PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \ + } \ + } \ +\ + for (dim_t j = panel_len;j < panel_len_max;j++) \ + for (dim_t i = 0;i < panel_dim_max;i++) \ + PASTEMAC(ch,set0s)( p_cast[ i + j*ldp ] ); \ +} + +INSERT_GENTFUNC_BASIC0(packm_diag_ukr); + +static packm_diag_ukr_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr ); + +void packm_diag + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ +#if 1 + + // We begin by copying the fields of A. + bli_obj_alias_to( a, p ); + + // Get information about data types. 
+ num_t dt = bli_obj_dt( a ); + num_t dt_tar = bli_obj_target_dt( a ); + num_t dt_scalar = bli_obj_scalar_dt( a ); + dim_t dt_size = bli_dt_size( dt ); + + if ( dt_scalar != dt || dt_tar != dt ) + bli_abort(); + + // Extract various fields from the control tree. + bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + + if ( schema != BLIS_PACKED_ROW_PANELS && + schema != BLIS_PACKED_COL_PANELS ) + bli_abort(); + + // Store the pack schema to the object. + bli_obj_set_pack_schema( schema, p ); + + // Clear the conjugation field from the object since matrix packing + // in BLIS is deemed to take care of all conjugation necessary. + bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); + + // If we are packing micropanels, mark P as dense. + bli_obj_set_uplo( BLIS_DENSE, p ); + + // Reset the view offsets to (0,0). + bli_obj_set_offs( 0, 0, p ); + + // Compute the dimensions padded by the dimension multiples. These + // dimensions will be the dimensions of the packed matrices, including + // zero-padding, and will be used by the macro- and micro-kernels. + // We compute them by starting with the effective dimensions of A (now + // in P) and aligning them to the dimension multiples (typically equal + // to register blocksizes). This does waste a little bit of space for + // level-2 operations, but that's okay with us. + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); + dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); + + // Save the padded dimensions into the packed object. 
It is important + // to save these dimensions since they represent the actual dimensions + // of the zero-padded matrix. + bli_obj_set_padded_dims( m_p_pad, n_p_pad, p ); + + // The "panel stride" of a micropanel packed object is interpreted as + // the distance between the (0,0) element of panel k and the (0,0) + // element of panel k+1. We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each micropanel (ie: the right edge of the matrix). Zero-padding + // can also occur along the long edge of the last micropanel if the m + // dimension of the matrix is not a whole multiple of MR. + inc_t ps_p = bmult_m_pack * n_p_pad; + + /* Compute the total number of iterations we'll need. */ + dim_t n_iter = m_p_pad / bmult_m_def; + + // Store the strides and panel dimension in P. + bli_obj_set_strides( 1, bmult_m_pack, p ); + bli_obj_set_imag_stride( 1, p ); + bli_obj_set_panel_dim( bmult_m_def, p ); + bli_obj_set_panel_stride( ps_p, p ); + bli_obj_set_panel_length( bmult_m_def, p ); + bli_obj_set_panel_width( n_p, p ); + + // Compute the size of the packed buffer. + siz_t size_p = ps_p * n_iter * dt_size; + if ( size_p == 0 ) return; + + // Update the buffer address in p to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). + char* p_cast = (char*)bli_packm_alloc( size_p, rntm, cntl, thread ); + bli_obj_set_buffer( p_cast, p ); + +#else + + // Every thread initializes p and determines the size of memory + // block needed (which gets embedded into the otherwise "blank" mem_t + // entry in the control tree node). Return early if no packing is required. 
+ if ( !bli_packm_init( a, p, cntx, rntm, cntl, thread ) ) + return; + + num_t dt = bli_obj_dt( a ); + dim_t dt_size = bli_dt_size( dt ); + + bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); + + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_p_pad = bli_obj_padded_length( p ); + dim_t n_p_pad = bli_obj_padded_width( p ); + dim_t n_iter = m_p_pad / bmult_m_def; + + char* p_cast = bli_obj_buffer( p ); + inc_t ps_p = bli_obj_panel_stride( p ); + +#endif + + char* a_cast = bli_obj_buffer_at_off( a ); + inc_t inca = bli_obj_row_stride( a ); + inc_t lda = bli_obj_col_stride( a ); + dim_t panel_len_off = bli_obj_col_off( a ); + conj_t conja = bli_obj_conj_status( a ); + + packm_diag_params_t* params = bli_obj_pack_params( a ); + char* d_cast = params->d; + inc_t incd = params->incd; + + obj_t kappa_local; + char* kappa_cast = bli_packm_scalar( &kappa_local, p ); + + packm_diag_ukr_vft packm_ker_cast = packm_diag_ukrs[ dt ]; + + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ + const dim_t nt = bli_thread_n_way( thread ); + const dim_t tid = bli_thread_work_id( thread ); + + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ + dim_t it_start, it_end, it_inc; + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); + + /* Iterate over every logical micropanel in the source matrix. 
*/ + for ( dim_t it = 0; it < n_iter; it += 1 ) + { + dim_t panel_dim_i = bli_min( bmult_m_def, m_p - it*bmult_m_def ); + + char* d_begin = d_cast + panel_len_off*incd*dt_size; + char* a_begin = a_cast + it* bmult_m_def*inca*dt_size; + char* p_begin = p_cast + it* ps_p*dt_size; + + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + { + packm_ker_cast + ( + conja, + panel_dim_i, + n_p, + bmult_m_def, + n_p_pad, + kappa_cast, + d_begin, incd, + a_begin, inca, lda, + p_begin, bmult_m_pack + ); + } + } +} + +/* + * Modify the object A to include information about the diagonal D, + * and imbue it with special function pointers which will take care + * of the actual work of forming (D * A^T) + */ +void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a ) +{ + // Assumes D is a column vector + params->d = bli_obj_buffer_at_off( d ); + params->incd = bli_obj_row_stride( d ); + + // Set the custom pack function. + bli_obj_set_pack_fn( packm_diag, a ); + + // Attach the parameters to the A object. + bli_obj_set_pack_params( params, a ); +} + +/* + * Implements C := alpha * A * D * A^T + beta * C + * + * where D is a diagonal matrix with elements taken from the "d" vector. + */ +void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c ) +{ + obj_t ad; // this is (D * A^T) + packm_diag_params_t params; + + bli_obj_alias_to( a, &ad ); + bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T + attach_diagonal_factor( &params, d, &ad ); + + // Does C := alpha * A * B + beta * C using B = (D * A^T) + bli_gemmt( alpha, a, &ad, beta, c ); +} + +int main( void ) +{ + obj_t a; + obj_t d; + obj_t c; + obj_t c_copy; + obj_t norm; + + dim_t m = 10; + dim_t k = 10; + + for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ ) + for ( int upper = 0; upper <= 1; upper++ ) + for ( int transa = 0; transa <= 1; transa++ ) + for ( int transc = 0; transc <= 1; transc++ ) + { + num_t dt = dt_; + uplo_t uplo = upper ?
BLIS_UPPER : BLIS_LOWER; + + bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a ); + bli_obj_create( dt, k, 1, 1, 1, &d ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c_copy ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy ); + bli_obj_set_uplo( uplo , &c ); + bli_obj_set_uplo( uplo , &c_copy ); + bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm ); + + bli_randm( &a ); + bli_randm( &d ); + bli_randm( &c ); + bli_copym( &c, &c_copy ); + + syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c ); + syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy ); + + bli_subm( &c_copy, &c ); + bli_normfm( &c, &norm ); + + double normr, normi; + bli_getsc( &norm, &normr, &normi ); + + printf( "dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n", + dt, upper, transa, transc, normr ); + + bli_obj_free( &a ); + bli_obj_free( &d ); + bli_obj_free( &c ); + bli_obj_free( &c_copy ); + bli_obj_free( &norm ); + } +} diff --git a/test/syrk_diagonal/syrk_diagonal_example2.cxx b/test/syrk_diagonal/syrk_diagonal_example2.cxx new file mode 100644 index 000000000..8312a07ee --- /dev/null +++ b/test/syrk_diagonal/syrk_diagonal_example2.cxx @@ -0,0 +1,338 @@ +#include "syrk_diagonal_ref.h" + +/* + * Forward-declare the pack kernel type and set up an array of + * packing kernels, one for each data type.
+ */ +template +void packm_diag_ukr + ( + bool conja, + dim_t panel_dim, + dim_t panel_len, + dim_t panel_dim_max, + dim_t panel_len_max, + void* restrict kappa, + void* restrict d, inc_t incd, + void* restrict a, inc_t inca, inc_t lda, + void* restrict p, inc_t ldp + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +static auto PASTEMAC(ch,op) = &packm_diag_ukr; + +INSERT_GENTFUNC_BASIC0(packm_diag_ukr); + +using packm_diag_ukr_vft = decltype(&packm_diag_ukr); +static packm_diag_ukr_vft GENARRAY( packm_diag_ukrs, packm_diag_ukr ); + +/* + * Structure which includes all additional information beyond what is + * already stored in the obj_t structure. + * + * This structure is **read-only** during the operation! + */ +struct packm_diag_params_t +{ + void* d; + inc_t incd; + + packm_diag_params_t() {} + + packm_diag_params_t( void* d, inc_t incd ) + : d(d), incd(incd) {} +}; + +/* + * Selecting a different kernel based on the current architecture is + * currently not possible, but is something we plan to support. 
+ */ +template <typename T> // T: element type of a, d, p and kappa +void packm_diag_ukr // packs P(i,j) := kappa * d[j] * A(i,j) (A conjugated when conja is set); rows panel_dim..panel_dim_max-1 and columns panel_len..panel_len_max-1 of P are zero-filled + ( + bool conja, + dim_t panel_dim, + dim_t panel_len, + dim_t panel_dim_max, + dim_t panel_len_max, + void* restrict kappa, + void* restrict d, inc_t incd, + void* restrict a, inc_t inca, inc_t lda, + void* restrict p, inc_t ldp + ) +{ + T* restrict a_cast = ( T* )a; + T* restrict p_cast = ( T* )p; + T* restrict d_cast = ( T* )d; + auto kappa_cast = *( T* )kappa; + + if ( conja ) + { + for ( dim_t j = 0; j < panel_len; j++ ) + { + auto kappa_d = kappa_cast * d_cast[ j*incd ]; + + for (dim_t i = 0;i < panel_dim;i++) + p_cast[ i + j*ldp ] = kappa_d * conj( a_cast[ i*inca + j*lda ] ); + + for (dim_t i = panel_dim;i < panel_dim_max;i++) + p_cast[ i + j*ldp ] = convert<T>(0.0); + } + } + else + { + for ( dim_t j = 0; j < panel_len; j++ ) + { + auto kappa_d = kappa_cast * d_cast[ j*incd ]; + + for (dim_t i = 0;i < panel_dim;i++) + p_cast[ i + j*ldp ] = kappa_d * a_cast[ i*inca + j*lda ]; + + for (dim_t i = panel_dim;i < panel_dim_max;i++) + p_cast[ i + j*ldp ] = convert<T>(0.0); + } + } + + for (dim_t j = panel_len;j < panel_len_max;j++) + for (dim_t i = 0;i < panel_dim_max;i++) + p_cast[ i + j*ldp ] = convert<T>(0.0); +} + +void packm_diag // custom pack function installed by attach_diagonal_factor(): sets up P as a micropanel-packed copy of A and fills it via packm_diag_ukr, scaling by the attached diagonal + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // We begin by copying the fields of A. + bli_obj_alias_to( a, p ); + + // Get information about data types. + num_t dt = bli_obj_dt( a ); + num_t dt_tar = bli_obj_target_dt( a ); + num_t dt_scalar = bli_obj_scalar_dt( a ); + dim_t dt_size = bli_dt_size( dt ); + + if ( dt_scalar != dt || dt_tar != dt ) // mixed datatypes are not handled by this pack function + bli_abort(); + + // Extract various fields from the control tree.
+ bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + + if ( schema != BLIS_PACKED_ROW_PANELS && + schema != BLIS_PACKED_COL_PANELS ) + bli_abort(); + + // Store the pack schema to the object. + bli_obj_set_pack_schema( schema, p ); + + // Clear the conjugation field from the object since matrix packing + // in BLIS is deemed to take care of all conjugation necessary. + bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); + + // If we are packing micropanels, mark P as dense. + bli_obj_set_uplo( BLIS_DENSE, p ); + + // Reset the view offsets to (0,0). + bli_obj_set_offs( 0, 0, p ); + + // Compute the dimensions padded by the dimension multiples. These + // dimensions will be the dimensions of the packed matrices, including + // zero-padding, and will be used by the macro- and micro-kernels. + // We compute them by starting with the effective dimensions of A (now + // in P) and aligning them to the dimension multiples (typically equal + // to register blocksizes). This does waste a little bit of space for + // level-2 operations, but that's okay with us. + dim_t m_p = bli_obj_length( p ); + dim_t n_p = bli_obj_width( p ); + dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); + dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); + + // Save the padded dimensions into the packed object. It is important + // to save these dimensions since they represent the actual dimensions + // of the zero-padded matrix.
+ bli_obj_set_padded_dims( m_p_pad, n_p_pad, p ); + + // The "panel stride" of a micropanel packed object is interpreted as + // the distance between the (0,0) element of panel k and the (0,0) + // element of panel k+1. We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each micropanel (ie: the right edge of the matrix). Zero-padding + // can also occur along the long edge of the last micropanel if the m + // dimension of the matrix is not a whole multiple of MR. + inc_t ps_p = bmult_m_pack * n_p_pad; + + /* Compute the total number of iterations we'll need. */ + dim_t n_iter = m_p_pad / bmult_m_def; + + // Store the strides and panel dimension in P. + bli_obj_set_strides( 1, bmult_m_pack, p ); + bli_obj_set_imag_stride( 1, p ); + bli_obj_set_panel_dim( bmult_m_def, p ); + bli_obj_set_panel_stride( ps_p, p ); + bli_obj_set_panel_length( bmult_m_def, p ); + bli_obj_set_panel_width( n_p, p ); + + // Compute the size of the packed buffer. + siz_t size_p = ps_p * n_iter * dt_size; + if ( size_p == 0 ) return; + + // Update the buffer address in p to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). + char* p_cast = (char*)bli_packm_alloc( size_p, rntm, cntl, thread ); + bli_obj_set_buffer( p_cast, p ); + + char* a_cast = (char*)bli_obj_buffer_at_off( a ); + inc_t inca = bli_obj_row_stride( a ); + inc_t lda = bli_obj_col_stride( a ); + dim_t panel_len_off = bli_obj_col_off( a ); + conj_t conja = bli_obj_conj_status( a ); + + auto params = (packm_diag_params_t*)bli_obj_pack_params( a ); + char* d_cast = (char*)params->d; + inc_t incd = params->incd; + + obj_t kappa_local; + char* kappa_cast = (char*)bli_packm_scalar( &kappa_local, p ); + + auto packm_ker_cast = packm_diag_ukrs[ dt ]; // NOTE(review): packm_diag_ukrs is not defined in the visible part of this file -- confirm it is declared before this point + + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node.
*/ + const dim_t nt = bli_thread_n_way( thread ); + const dim_t tid = bli_thread_work_id( thread ); + + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ + dim_t it_start, it_end, it_inc; + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); + + /* Iterate over every logical micropanel in the source matrix. */ + for ( dim_t it = 0; it < n_iter; it += 1 ) + { + dim_t panel_dim_i = bli_min( bmult_m_def, m_p - it*bmult_m_def ); + + char* d_begin = d_cast + panel_len_off*incd*dt_size; // d is indexed by A's column offset, so every micropanel starts at the same element of d + char* a_begin = a_cast + it* bmult_m_def*inca*dt_size; + char* p_begin = p_cast + it* ps_p*dt_size; + + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + { + packm_ker_cast( conja, + panel_dim_i, + n_p, + bmult_m_def, + n_p_pad, + kappa_cast, + d_begin, incd, + a_begin, inca, lda, + p_begin, bmult_m_pack ); + } + } +} + +/* + * Modify the object A to include information about the diagonal D, + * and imbue it with special function pointers which will take care + * of the actual work of forming (D * A^T) + */ +void attach_diagonal_factor( packm_diag_params_t* params, obj_t* d, obj_t* a ) +{ + // Assumes D is a column vector + new (params) packm_diag_params_t + ( + bli_obj_buffer_at_off( d ), + bli_obj_row_stride( d ) + ); + + // Set the custom pack function. + bli_obj_set_pack_fn( packm_diag, a ); + + // Attach the parameters to the A object. + bli_obj_set_pack_params( params, a ); +} + +/* + * Implements C := alpha * A * D * A^T + beta * C + * + * where D is a diagonal matrix with elements taken from the "d" vector.
+ */ +void syrk_diag( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c ) +{ + obj_t ad; // this is (D * A^T) + packm_diag_params_t params; + + bli_obj_alias_to( a, &ad ); + bli_obj_toggle_trans( &ad ); // because gemmt is A*B instead of A*B^T + attach_diagonal_factor( &params, d, &ad ); + + // Does C := alpha * A * B + beta * C using B = (D * A^T) + bli_gemmt( alpha, a, &ad, beta, c ); +} + +int main() +{ + obj_t a; + obj_t d; + obj_t c; + obj_t c_copy; + obj_t norm; + + auto m = 10; + auto k = 10; + + for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ ) + for ( int upper = 0; upper <= 1; upper++ ) + for ( int transa = 0; transa <= 1; transa++ ) + for ( int transc = 0; transc <= 1; transc++ ) + { + auto dt = ( num_t )dt_; + auto uplo = upper ? BLIS_UPPER : BLIS_LOWER; + + bli_obj_create( dt, m, k, transa ? k : 1, transa ? 1 : m, &a ); + bli_obj_create( dt, k, 1, 1, 1, &d ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ? 1 : m, &c ); + bli_obj_create( dt, m, m, transc ? m : 1, transc ?
1 : m, &c_copy ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c ); + bli_obj_set_struc( BLIS_SYMMETRIC , &c_copy ); + bli_obj_set_uplo( uplo , &c ); + bli_obj_set_uplo( uplo , &c_copy ); + bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm ); + + bli_randm( &a ); + bli_randm( &d ); + bli_randm( &c ); + bli_copym( &c, &c_copy ); + + syrk_diag( &BLIS_ONE, &a, &d, &BLIS_ONE, &c ); + syrk_diag_ref( &BLIS_ONE, &a, &d, &BLIS_ONE, &c_copy ); + + bli_subm( &c_copy, &c ); + bli_normfm( &c, &norm ); + + double normr, normi; + bli_getsc( &norm, &normr, &normi ); + + printf("dt: %d, upper: %d, transa: %d, transc: %d, norm: %g\n", + dt, upper, transa, transc, normr); + + bli_obj_free( &a ); + bli_obj_free( &d ); + bli_obj_free( &c ); + bli_obj_free( &c_copy ); + bli_obj_free( &norm ); + } +} diff --git a/test/syrk_diagonal/syrk_diagonal_ref.cxx b/test/syrk_diagonal/syrk_diagonal_ref.cxx new file mode 100644 index 000000000..1d7c5d96e --- /dev/null +++ b/test/syrk_diagonal/syrk_diagonal_ref.cxx @@ -0,0 +1,102 @@ +#include "syrk_diagonal_ref.h" +#include "complex_math.hpp" + +typedef void (*syrk_diag_ref_vft) + ( + uplo_t uplo, + dim_t m, + dim_t k, + void* alpha, + void* a, inc_t rs_a, inc_t cs_a, + void* d, inc_t incd, + void* beta, + void* c, inc_t rs_c, inc_t cs_c + ); + +template <typename T> +void syrk_diag_ref // reference loop: C(i,j) := alpha * sum_p A(i,p)*d[p]*A(j,p) + beta * C(i,j), over the uplo triangle of C only + ( + uplo_t uplo, + dim_t m, + dim_t k, + void* alpha, + void* a, inc_t rs_a, inc_t cs_a, + void* d, inc_t incd, + void* beta, + void* c, inc_t rs_c, inc_t cs_c + ) +{ + auto alpha_cast = *( T* )alpha; + auto beta_cast = *( T* )beta; + auto a_cast = ( T* )a; + auto d_cast = ( T* )d; + auto c_cast = ( T* )c; + + for ( dim_t i = 0; i < m; i++ ) + { + dim_t j_min = uplo == BLIS_UPPER ? i : 0; + dim_t j_max = uplo == BLIS_UPPER ?
m : i+1; + + for ( dim_t j = j_min; j < j_max; j++ ) + { + auto ada = convert<T>(0.0); + + for ( dim_t p = 0; p < k; p++ ) + { + ada += a_cast[ i*rs_a + p*cs_a ] * + d_cast[ p*incd ] * + a_cast[ j*rs_a + p*cs_a ]; + } + + if ( beta_cast == convert<T>(0.0) ) + { + c_cast[ i*rs_c + j*cs_c ] = alpha_cast * ada; + } + else + { + c_cast[ i*rs_c + j*cs_c ] = alpha_cast * ada + + beta_cast * c_cast[ i*rs_c + j*cs_c ]; + } + } + } +} + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +static auto PASTEMAC(ch,op) = &syrk_diag_ref<ctype>; + +INSERT_GENTFUNC_BASIC0(syrk_diag_ref); + +static syrk_diag_ref_vft GENARRAY( syrk_diag_ref_impl, syrk_diag_ref ); + +void syrk_diag_ref( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c ) +{ + num_t dt = bli_obj_dt( a ); + + dim_t m = bli_obj_length_after_trans( a ); + dim_t k = bli_obj_width_after_trans( a ); + + inc_t rs_a = bli_obj_row_stride( a ); + inc_t cs_a = bli_obj_col_stride( a ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + inc_t incd = bli_obj_row_stride( d ); + + if ( bli_obj_has_trans( a ) ) + bli_swap_incs( &rs_a, &cs_a ); + + if ( bli_obj_has_trans( c ) ) + bli_swap_incs( &rs_c, &cs_c ); + + syrk_diag_ref_impl[ dt ] + ( + bli_obj_uplo( c ), + m, k, + bli_obj_buffer_for_1x1( dt, alpha ), + bli_obj_buffer_at_off( a ), rs_a, cs_a, + bli_obj_buffer_at_off( d ), incd, + bli_obj_buffer_for_1x1( dt, beta ), + bli_obj_buffer_at_off( c ), rs_c, cs_c + ); +} + diff --git a/test/syrk_diagonal/syrk_diagonal_ref.h b/test/syrk_diagonal/syrk_diagonal_ref.h new file mode 100644 index 000000000..a6864caec --- /dev/null +++ b/test/syrk_diagonal/syrk_diagonal_ref.h @@ -0,0 +1,8 @@ +#include "blis.h" + +#ifdef __cplusplus +#include "complex_math.hpp" +extern "C" +#endif +void syrk_diag_ref( obj_t* alpha, obj_t* a, obj_t* d, obj_t* beta, obj_t* c ); + diff --git a/test/tensor_contraction/complex_math.hpp b/test/tensor_contraction/complex_math.hpp new file mode 100644 index 000000000..9c68e730a --- /dev/null +++
b/test/tensor_contraction/complex_math.hpp @@ -0,0 +1,267 @@ +#include +#include +#include + +#include "blis.h" + +template +struct is_complex : std::false_type {}; + +template <> +struct is_complex : std::true_type {}; + +template <> +struct is_complex : std::true_type {}; + +template +struct is_real : std::integral_constant::value> {}; + +template struct make_complex; + +template <> struct make_complex { using type = scomplex; }; +template <> struct make_complex { using type = dcomplex; }; +template <> struct make_complex { using type = scomplex; }; +template <> struct make_complex { using type = dcomplex; }; + +template +using make_complex_t = typename make_complex::type; + +template struct make_real; + +template <> struct make_real { using type = float; }; +template <> struct make_real { using type = double; }; +template <> struct make_real { using type = float; }; +template <> struct make_real { using type = double; }; + +template +using make_real_t = typename make_real::type; + +template +struct make_complex_if : std::conditional,make_real_t> {}; + +template +using make_complex_if_t = typename make_complex_if::type; + +template +struct real_imag_part +{ + real_imag_part& operator=(T) { return *this; } + + operator T() const { return T(); } +}; + +template +std::enable_if_t::type>::value,T&> real(T& x) { return x; } + +template +std::enable_if_t::value,real_imag_part> imag(T x) { return {}; } + +inline float& real(scomplex& x) { return x.real; } + +inline float& imag(scomplex& x) { return x.imag; } + +inline double& real(dcomplex& x) { return x.real; } + +inline double& imag(dcomplex& x) { return x.imag; } + +inline const float& real(const scomplex& x) { return x.real; } + +inline const float& imag(const scomplex& x) { return x.imag; } + +inline const double& real(const dcomplex& x) { return x.real; } + +inline const double& imag(const dcomplex& x) { return x.imag; } + +template +std::enable_if_t::value,T> conj(T x) { return x; } + +template 
+std::enable_if_t::value,T> conj(const T& x) { return {x.real, -x.imag}; } + +template +struct convert_impl; + +template +struct convert_impl::value && is_real::value>> +{ + void operator()(T x, U& y) const { y = x; } +}; + +template +struct convert_impl::value && is_complex::value>> +{ + void operator()(T x, U& y) const { y.real = x; y.imag = 0; } +}; + +template +struct convert_impl::value && is_real::value>> +{ + void operator()(T x, U& y) const { y = x.real; } +}; + +template +struct convert_impl::value && is_complex::value>> +{ + void operator()(T x, U& y) const { y.real = x.real; y.imag = x.imag; } +}; + +template +U convert(T x) +{ + U y; + convert_impl{}(x,y); + return y; +} + +template +auto convert_prec(T x) -> make_complex_if_t::value> +{ + return convert::value>>(x); +} + +#define COMPLEX_MATH_OPS(rtype, ctype) \ +\ +inline bool operator==(rtype x, ctype y) \ +{ \ + return x == y.real && y.imag == 0; \ +} \ +\ +inline bool operator==(ctype x, rtype y) \ +{ \ + return y == x.real && x.imag == 0; \ +} \ +\ +inline bool operator==(ctype x, ctype y) \ +{ \ + return x.real == y.real && \ + x.imag == y.imag; \ + } \ + \ +inline ctype operator-(ctype x) \ +{ \ + return {-x.real, -x.imag}; \ +} \ +\ +inline ctype operator+(rtype x, ctype y) \ +{ \ + return {x+y.real, y.imag}; \ +} \ +\ +inline ctype operator+(ctype x, rtype y) \ +{ \ + return {y+x.real, x.imag}; \ +} \ +\ +inline ctype operator+(ctype x, ctype y) \ +{ \ + return {x.real+y.real, x.imag+y.imag}; \ +} \ +\ +inline ctype operator-(rtype x, ctype y) \ +{ \ + return {x-y.real, -y.imag}; \ +} \ +\ +inline ctype operator-(ctype x, rtype y) \ +{ \ + return {x.real-y, x.imag}; \ +} \ +\ +inline ctype operator-(ctype x, ctype y) \ +{ \ + return {x.real-y.real, x.imag-y.imag}; \ +} \ +\ +inline ctype operator*(rtype x, ctype y) \ +{ \ + return {x*y.real, x*y.imag}; \ +} \ +\ +inline ctype operator*(ctype x, rtype y) \ +{ \ + return {y*x.real, y*x.imag}; \ +} \ +\ +inline ctype operator*(ctype x, ctype y) \ 
+{ \ + return {x.real*y.real - x.imag*y.imag, \ + x.real*y.imag + x.imag*y.real}; \ +} \ +\ +inline ctype operator/(rtype x, ctype y) \ +{ \ + auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \ + auto n = std::ilogb(scale); \ + auto yrs = std::scalbn(y.real, -n); \ + auto yis = std::scalbn(y.imag, -n); \ + auto denom = y.real*yrs + y.imag*yis; \ + return {x*yrs/denom, -x*yis/denom}; \ +} \ +\ +inline ctype operator/(ctype x, rtype y) \ +{ \ + return {x.real/y, x.imag/y}; \ +} \ +\ +inline ctype operator/(ctype x, ctype y) \ +{ \ + auto scale = std::max(std::abs(y.real), std::abs(y.imag)); \ + auto n = std::ilogb(scale); \ + auto yrs = std::scalbn(y.real, -n); \ + auto yis = std::scalbn(y.imag, -n); \ + auto denom = y.real*yrs + y.imag*yis; \ + return {(x.real*yrs + x.imag*yis)/denom, \ + (x.imag*yrs - x.real*yis)/denom}; \ +} \ +\ +inline ctype& operator+=(ctype& x, rtype y) \ +{ \ + x.real += y; \ + return x; \ +} \ +\ +inline ctype& operator+=(ctype& x, ctype y) \ +{ \ + x.real += y.real; x.imag += y.imag; \ + return x; \ +} \ +\ +inline ctype& operator-=(ctype& x, rtype y) \ +{ \ + x.real -= y; \ + return x; \ +} \ +\ +inline ctype& operator-=(ctype& x, ctype y) \ +{ \ + x.real -= y.real; x.imag -= y.imag; \ + return x; \ +} \ +\ +inline ctype& operator*=(ctype& x, rtype y) \ +{ \ + x.real *= y; x.imag *= y; \ + return x; \ +} \ +\ +inline ctype& operator*=(ctype& x, ctype y) \ +{ \ + x = x * y; \ + return x; \ +} \ +\ +inline ctype& operator/=(ctype& x, rtype y) \ +{ \ + x.real /= y; x.imag /= y; \ + return x; \ +} \ +\ +inline ctype& operator/=(ctype& x, ctype y) \ +{ \ + x = x / y; \ + return x; \ +} + +COMPLEX_MATH_OPS(float, scomplex); +COMPLEX_MATH_OPS(double, dcomplex); + diff --git a/test/tensor_contraction/tcontract_example.cxx b/test/tensor_contraction/tcontract_example.cxx new file mode 100644 index 000000000..0b935c54d --- /dev/null +++ b/test/tensor_contraction/tcontract_example.cxx @@ -0,0 +1,988 @@ + +#include "tcontract_ref.hpp" + 
+#include +#include + +static constexpr dim_t BS_K = 8; + +struct packm_tensor_params_t +{ + gint_t ndim_m, ndim_n; + const dim_t *len_m, *len_n; + const inc_t *stride_m, *stride_n; + + packm_tensor_params_t() {} + + packm_tensor_params_t( gint_t ndim_m, const dim_t* len_m, const inc_t* stride_m, + gint_t ndim_n, const dim_t* len_n, const inc_t* stride_n ) + : ndim_m(ndim_m), ndim_n(ndim_n), + len_m(len_m), len_n(len_n), + stride_m(stride_m), stride_n(stride_n) {} +}; + +using gemm_tensor_params_t = packm_tensor_params_t; + +template +void packm_ckx_nb + ( + bool conja, + dim_t panel_dim, + dim_t panel_len, + dim_t panel_dim_max, + dim_t panel_len_max, + void* kappa, + void* a, inc_t inca, inc_t* bsa, inc_t* scata, + void* p, inc_t ldp + ) +{ + T* restrict a_cast = ( T* )a; + T* restrict p_cast = ( T* )p; + auto kappa_cast = *( T* )kappa; + + if ( conja ) + { + for ( auto j0 = 0; j0 < panel_len; j0 += BS_K, bsa += BS_K, scata += BS_K ) + { + auto lda = *bsa; + auto panel_len_j = std::min( panel_len-j0, BS_K ); + + if ( lda ) + { + T* restrict aj = a_cast + *scata; + + for ( auto j = 0; j < panel_len_j; j++ ) + { + for ( auto i = 0; i < panel_dim; i++ ) + p_cast[ i ] = kappa_cast * conj( aj[ i*inca + j*lda ] ); + + for ( auto i = panel_dim; i < panel_dim_max; i++ ) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } + } + else + { + for ( auto j = 0; j < panel_len_j; j++) + { + for ( auto i = 0; i < panel_dim; i++) + p_cast[ i ] = kappa_cast * conj( a_cast[ i*inca + scata[j] ] ); + + for ( auto i = panel_dim; i < panel_dim_max; i++) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } + } + } + } + else + { + for ( auto j0 = 0; j0 < panel_len; j0 += BS_K, bsa += BS_K, scata += BS_K ) + { + auto lda = *bsa; + auto panel_len_j = std::min( panel_len-j0, BS_K ); + + if ( lda ) + { + T* restrict aj = a_cast + *scata; + + for ( auto j = 0; j < panel_len_j; j++ ) + { + for ( auto i = 0; i < panel_dim; i++ ) + p_cast[ i ] = kappa_cast * aj[ i*inca + j*lda ]; + + for ( auto 
i = panel_dim; i < panel_dim_max; i++ ) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } + } + else + { + for ( auto j = 0; j < panel_len_j; j++ ) + { + for ( auto i = 0; i < panel_dim; i++ ) + p_cast[ i ] = kappa_cast * a_cast[ i*inca + scata[j] ]; + + for ( auto i = panel_dim; i < panel_dim_max; i++ ) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } + } + } + } + + for ( auto j = panel_len; j < panel_len_max; j++) + { + for ( auto i = 0; i < panel_dim_max; i++) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } +} + +template +void packm_ckx_ss + ( + bool conja, + dim_t panel_dim, + dim_t panel_len, + dim_t panel_dim_max, + dim_t panel_len_max, + void* kappa, + void* a, inc_t* inca, inc_t* scata, + void* p, inc_t ldp + ) +{ + T* restrict a_cast = ( T* )a; + T* restrict p_cast = ( T* )p; + auto kappa_cast = *( T* )kappa; + + if ( conja ) + { + for (dim_t j = 0;j < panel_len;j++) + { + for (dim_t i = 0;i < panel_dim;i++) + p_cast[ i ] = kappa_cast * conj( a_cast[ inca[i] + scata[j] ] ); + + for (dim_t i = panel_dim;i < panel_dim_max;i++) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } + } + else + { + for (dim_t j = 0;j < panel_len;j++) + { + for (dim_t i = 0;i < panel_dim;i++) + p_cast[ i ] = kappa_cast * a_cast[ inca[i] + scata[j] ]; + + for (dim_t i = panel_dim;i < panel_dim_max;i++) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } + } + + for (dim_t j = panel_len;j < panel_len_max;j++) + { + for (dim_t i = 0;i < panel_dim_max;i++) + p_cast[ i ] = convert(0.0); + + p_cast += ldp; + } +} + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +static auto PASTEMAC(ch,op) = &packm_ckx_nb; + +INSERT_GENTFUNC_BASIC0(packm_ckx_nb); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +static auto PASTEMAC(ch,op) = &packm_ckx_ss; + +INSERT_GENTFUNC_BASIC0(packm_ckx_ss); + +static decltype(&packm_ckx_nb) GENARRAY( packm_ckx_nb_ukrs, packm_ckx_nb ); +static decltype(&packm_ckx_ss) GENARRAY( packm_ckx_ss_ukrs, packm_ckx_ss ); + +static void fill_scatter // fills scat[0..size) with the element offsets of linear indices off..off+size-1 of the (len, stride) index space; for each block of BS entries, bs[block start] is stride[0] when the block is evenly strided and 0 otherwise + (
+ gint_t ndim, + const dim_t* restrict len, + const inc_t* restrict stride, + dim_t BS, + inc_t off, + dim_t size, + inc_t* restrict scat, + inc_t* restrict bs + ) +{ + if ( size == 0 ) return; + + if ( ndim == 0 ) + { + *scat = 0; + *bs = 0; + return; + } + + if ( ndim == 1 ) // single index: every entry is evenly strided, so finish here + { + assert(off >= 0 && size >= 0 && off+size <= len[0]); + for ( dim_t i = 0; i < size; i++ ) // only `size` entries exist in scat/bs; the view may be shorter than len[0] + { + scat[i] = (off+i)*stride[0]; // offsets are relative to index 0, so include the view offset + bs[i] = stride[0]; + } + return; + } + + dim_t tot_len = 1; + for ( auto i = 0; i < ndim; i++ ) + tot_len *= len[i]; + + assert(off >= 0); + assert(size >= 0); + assert(off+size <= tot_len); + + auto len0 = len[0]; + auto stride0 = stride[0]; + auto off0 = off % len0; + auto off1 = off / len0; + auto size1 = ( size + off0 + len0 - 1) / len0; + + inc_t pos1 = 0; + inc_t idx = 0; + for_each( ndim-1, len+1, off1, size1, pos1, stride+1, + [&] + { + auto pos = pos1 + off0 * stride0; + auto len_i = std::min( len0-off0, size-idx ); + for ( auto i = 0; i < len_i; i++ ) + { + scat[idx++] = pos; + pos += stride0; + } + off0 = 0; + }); + assert(idx == size); + + for ( idx = 0; idx < size; idx += BS ) + { + auto len_i = std::min( BS, size-idx ); + auto s = stride0; + + for ( auto i = idx; i < idx+len_i-1; i++) + { + if (scat[i+1]-scat[i] != s) + { + s = 0; + break; + } + } + + bs[idx] = s; + } +} + +void packm_tensor + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // We begin by copying the fields of A. + bli_obj_alias_to( a, p ); + + // Get information about data types. + auto dt = bli_obj_dt( a ); + auto dt_tar = bli_obj_target_dt( a ); + auto dt_scalar = bli_obj_scalar_dt( a ); + auto dt_size = bli_dt_size( dt ); + + if ( dt_scalar != dt || dt_tar != dt ) + bli_abort(); + + // Extract various fields from the control tree.
+ auto bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + auto bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + auto schema = bli_cntl_packm_params_pack_schema( cntl ); + auto bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); + auto bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); + auto bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + + if ( schema != BLIS_PACKED_ROW_PANELS && + schema != BLIS_PACKED_COL_PANELS ) + bli_abort(); + + // Store the pack schema to the object. + bli_obj_set_pack_schema( schema, p ); + + // Clear the conjugation field from the object since matrix packing + // in BLIS is deemed to take care of all conjugation necessary. + bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); + + // If we are packing micropanels, mark P as dense. + bli_obj_set_uplo( BLIS_DENSE, p ); + + // Reset the view offsets to (0,0). + bli_obj_set_offs( 0, 0, p ); + + // Compute the dimensions padded by the dimension multiples. These + // dimensions will be the dimensions of the packed matrices, including + // zero-padding, and will be used by the macro- and micro-kernels. + // We compute them by starting with the effective dimensions of A (now + // in P) and aligning them to the dimension multiples (typically equal + // to register blocksizes). This does waste a little bit of space for + // level-2 operations, but that's okay with us. + auto m_p = bli_obj_length( p ); + auto n_p = bli_obj_width( p ); + auto m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); + auto n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); + + // Save the padded dimensions into the packed object. It is important + // to save these dimensions since they represent the actual dimensions + // of the zero-padded matrix. + bli_obj_set_padded_dims( m_p_pad, n_p_pad, p ); + + // The "panel stride" of a micropanel packed object is interpreted as + // the distance between the (0,0) element of panel k and the (0,0) + // element of panel k+1. 
We use the padded width computed above to + // allow for zero-padding (if necessary/desired) along the far end + // of each micropanel (ie: the right edge of the matrix). Zero-padding + // can also occur along the long edge of the last micropanel if the m + // dimension of the matrix is not a whole multiple of MR. + auto ps_p = bmult_m_pack * n_p_pad; + + /* Compute the total number of iterations we'll need. */ + auto n_iter = m_p_pad / bmult_m_def; + + // Store the strides and panel dimension in P. + bli_obj_set_strides( 1, bmult_m_pack, p ); + bli_obj_set_imag_stride( 1, p ); + bli_obj_set_panel_dim( bmult_m_def, p ); + bli_obj_set_panel_stride( ps_p, p ); + bli_obj_set_panel_length( bmult_m_def, p ); + bli_obj_set_panel_width( n_p, p ); + + // Compute the size of the packed buffer. + auto size_p = ps_p * n_iter * dt_size; + if ( size_p == 0 ) return; + + // Compute the size of the scatter and block-scatter vectors to the total. + // It is never necessary to add padding for alignment because: + // 1) ps_p is always even + // 2) dt_size is a power of two >= 4 + // 3) the alignment of the scatter vectors is at most 8 + auto scat_size = 2 * (m_p + n_p) * sizeof(inc_t); + + // Update the buffer address in p to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). + auto p_cast = (char*)bli_packm_alloc( size_p + scat_size, rntm, cntl, thread ); + bli_obj_set_buffer( p_cast, p ); + + // Get the addresses of the scatter and block-scatter vectors. These are + // placed directly after the packed matrix buffer. 
+ auto rscat = (inc_t*)(p_cast + size_p); + auto rbs = rscat + m_p; + auto cscat = rbs + m_p; + auto cbs = cscat + n_p; + + auto a_cast = (char*)bli_obj_buffer_at_off( a ); + auto panel_dim_off = bli_obj_row_off( a ); + auto panel_len_off = bli_obj_col_off( a ); + auto conja = bli_obj_conj_status( a ); + + auto params = (packm_tensor_params_t*)bli_obj_pack_params( a ); + auto ndim_m = params->ndim_m; + auto ndim_n = params->ndim_n; + auto len_m = params->len_m; + auto len_n = params->len_n; + auto stride_m = params->stride_m; + auto stride_n = params->stride_n; + + obj_t kappa_local; + auto kappa_cast = (char*)bli_packm_scalar( &kappa_local, p ); + + auto packm_nb_ker = packm_ckx_nb_ukrs[ dt ]; + auto packm_ss_ker = packm_ckx_ss_ukrs[ dt ]; + + a_cast -= ( panel_dim_off * stride_m[0] + + panel_len_off * stride_n[0] ) * dt_size; + + /* Fill in the scatter and block-scatter vectors. This is done single-threaded for now. */ + if ( bli_thread_am_ochief( thread ) ) + { + fill_scatter + ( + ndim_m, + len_m, + stride_m, + bmult_m_def, + panel_dim_off, + m_p, + rscat, + rbs + ); + + fill_scatter + ( + ndim_n, + len_n, + stride_n, + BS_K, + panel_len_off, + n_p, + cscat, + cbs + ); + } + + /* Wait for the scatter vectors to be done. */ + bli_thread_barrier( thread ); + + /* Query the number of threads and thread ids from the current thread's + packm thrinfo_t node. */ + auto nt = bli_thread_n_way( thread ); + auto tid = bli_thread_work_id( thread ); + + /* Determine the thread range and increment using the current thread's + packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + will depend on whether slab or round-robin partitioning was requested + at configure-time. */ + dim_t it_start, it_end, it_inc; + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); + + /* Iterate over every logical micropanel in the source matrix. 
*/ + for ( auto it = 0; it < n_iter; it += 1 ) + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + { + auto panel_dim_i = bli_min( bmult_m_def, m_p - it*bmult_m_def ); + + auto p_begin = p_cast + it*ps_p*dt_size; + auto inca = rbs[ it*bmult_m_def ]; + + if ( inca ) + { + auto a_begin = a_cast + rscat[ it*bmult_m_def ]*dt_size; + + packm_nb_ker( conja, + panel_dim_i, + n_p, + bmult_m_def, + n_p_pad, + kappa_cast, + a_begin, inca, cbs, cscat, + p_begin, bmult_m_pack ); + } + else + { + auto a_begin = a_cast; + auto rscat_use = rscat + it*bmult_m_def; + + packm_ss_ker( conja, + panel_dim_i, + n_p, + bmult_m_def, + n_p_pad, + kappa_cast, + a_begin, rscat_use, cscat, + p_begin, bmult_m_pack ); + } + } +} + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +void PASTEMAC(ch,op) \ + ( \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t* rs_y, inc_t* cs_y \ + ) \ +{ \ + ctype* restrict x_cast = (ctype*)x; \ + ctype b_cast = *(ctype*)b; \ + ctype* restrict y_cast = (ctype*)y; \ +\ + if ( PASTEMAC(ch,eq0)( b_cast ) ) \ + { \ + for ( auto i = 0; i < m; i++ ) \ + for ( auto j = 0; j < n; j++ ) \ + PASTEMAC(ch,copys)( x_cast[ i*rs_x + j*cs_x ], y_cast[ rs_y[i] + cs_y[j] ] ); \ + } \ + else \ + { \ + for ( auto i = 0; i < m; i++ ) \ + for ( auto j = 0; j < n; j++ ) \ + PASTEMAC(ch,xpbys)( x_cast[ i*rs_x + j*cs_x ], b_cast, y_cast[ rs_y[i] + cs_y[j] ] ); \ + } \ +} + +INSERT_GENTFUNC_BASIC0(scatter_mxn); + +static decltype(&bli_sscatter_mxn) GENARRAY(scatter_mxn, scatter_mxn); + +void gemm_tensor + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + auto dt = bli_obj_dt( c ); + auto dt_size = bli_dt_size( dt ); + + auto m = bli_obj_length( c ); + auto n = bli_obj_width( c ); + auto k = bli_obj_width( a ); + + auto a_cast = (char*)bli_obj_buffer_at_off( a ); + auto pd_a = bli_obj_panel_dim( a ); + auto ps_a = bli_obj_panel_stride( a ); + + auto b_cast = 
(char*)bli_obj_buffer_at_off( b ); + auto pd_b = bli_obj_panel_dim( b ); + auto ps_b = bli_obj_panel_stride( b ); + + auto c_cast = (char*)bli_obj_buffer_at_off( c ); + auto rs_c0 = bli_obj_row_stride( c ); + auto cs_c0 = bli_obj_col_stride( c ); + auto off_m = bli_obj_row_off( c ); + auto off_n = bli_obj_col_off( c ); + + auto params = (gemm_tensor_params_t*)bli_obj_ker_params( c ); + auto ndim_m = params->ndim_m; + auto ndim_n = params->ndim_n; + auto len_m = params->len_m; + auto len_n = params->len_n; + auto stride_m = params->stride_m; + auto stride_n = params->stride_n; + + if ( rs_c0 != stride_m[0] || cs_c0 != stride_n[0] ) + { + std::swap( ndim_m, ndim_n ); + std::swap( len_m, len_n ); + std::swap( stride_m, stride_n ); + } + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + + c_cast -= ( off_m * stride_m[0] + + off_n * stride_n[0] ) * dt_size; + + // Detach and multiply the scalars attached to A and B. + // NOTE: We know that the internal scalars of A and B are already of the + // target datatypes because the necessary typecasting would have already + // taken place during bli_packm_init(). + obj_t scalar_a; + obj_t scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + // NOTE: We know that scalar_b is of type dt due to the above code + // that casts the scalars of A and B to dt via scalar_a and scalar_b, + // and we know that the internal scalar in C is already of the type dt + // due to the casting in the implementation of bli_obj_scalar_attach(). + auto alpha_cast = (char*)bli_obj_internal_scalar_buffer( &scalar_b ); + auto beta_cast = (char*)bli_obj_internal_scalar_buffer( c ); + + /* Alias some constants to simpler names. 
*/ + auto MR = pd_a; + auto NR = pd_b; + + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ + auto gemm_ukr = (gemm_ukr_vft)bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ + char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + auto col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); + auto rs_ct = ( col_pref ? 1 : NR ); + auto cs_ct = ( col_pref ? MR : 1 ); + auto zero = (char*)bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + auto scat_size = 2 * (m + n) * sizeof(inc_t); + auto rscat_c = (inc_t*)bli_packm_alloc_ex( scat_size, BLIS_BUFFER_FOR_GEN_USE, rntm, cntl, thread ); + auto rbs_c = rscat_c + m; + auto cscat_c = rbs_c + m; + auto cbs_c = cscat_c + n; + + /* Fill in the scatter and block-scatter vectors. This is done single-threaded for now. */ + if ( bli_thread_am_ochief( thread ) ) + { + fill_scatter + ( + ndim_m, + len_m, + stride_m, + MR, + off_m, + m, + rscat_c, + rbs_c + ); + + fill_scatter + ( + ndim_n, + len_n, + stride_n, + NR, + off_n, + n, + cscat_c, + cbs_c + ); + } + + /* Wait for the scatter vectors to be done. */ + bli_thread_barrier( thread ); + + /* Compute number of primary and leftover components of the m and n + dimensions. */ + auto n_iter = n / NR; + auto n_left = n % NR; + + auto m_iter = m / MR; + auto m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. 
*/ + auto rstep_a = ps_a * dt_size; + auto cstep_b = ps_b * dt_size; + + /* Save the virtual microkernel address and the params. */ + auxinfo_t aux; + bli_auxinfo_set_ukr( (void*)gemm_ukr, &aux ); + bli_auxinfo_set_params( params, &aux ); + + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ + auto caucus = bli_thrinfo_sub_node( thread ); + + /* Query the number of threads and thread ids for each loop. */ + auto jr_nt = bli_thread_n_way( thread ); + auto jr_tid = bli_thread_work_id( thread ); + auto ir_nt = bli_thread_n_way( caucus ); + auto ir_tid = bli_thread_work_id( caucus ); + + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + /* Loop over the n dimension (NR columns at a time). */ + for ( auto j = jr_start; j < jr_end; j += jr_inc ) + { + auto b1 = b_cast + j * cstep_b; + + auto n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + auto b2 = b1; + + /* Loop over the m dimension (MR rows at a time). */ + for ( auto i = ir_start; i < ir_end; i += ir_inc ) + { + auto a1 = a_cast + i * rstep_a; + auto rscat_c1 = rscat_c + i * MR; + auto rbs_c1 = rbs_c + i * MR; + auto cscat_c1 = cscat_c + j * NR; + auto cbs_c1 = cbs_c + j * NR; + + auto m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. 
*/ + auto a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + auto rs_c = *rbs_c1; + auto cs_c = *cbs_c1; + + if ( rs_c && cs_c ) + { + auto c11 = c_cast + ( *rscat_c1 + *cscat_c1 ) * dt_size; + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + alpha_cast, + a1, + b1, + beta_cast, + c11, rs_c, cs_c, + &aux, + cntx + ); + } + else + { + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + MR, + NR, + k, + alpha_cast, + a1, + b1, + zero, + &ct, rs_ct, cs_ct, + &aux, + cntx + ); + + /* Scatter to C. */ + scatter_mxn[ dt ] + ( + m_cur, n_cur, + &ct, rs_ct, cs_ct, + beta_cast, + c_cast, rscat_c1, cscat_c1 + ); + } + } + } +} + +static bool has_unit_stride( const std::vector& stride ) +{ + for ( auto s : stride ) + if ( s == 1 ) + return true; + return false; +} + +void tcontract( num_t dt, const std::vector& m, const std::vector& n, const std::vector& k, + const void* alpha, const void* a, std::vector rs_a, std::vector cs_a, + const void* b, std::vector rs_b, std::vector cs_b, + const void* beta, void* c, std::vector rs_c, std::vector cs_c ) +{ + if ( rs_a.size() != m.size() || + rs_b.size() != k.size() || + rs_c.size() != m.size() ) + bli_check_error_code( BLIS_INVALID_ROW_STRIDE ); + + if ( cs_a.size() != k.size() || + cs_b.size() != n.size() || + cs_c.size() != n.size() ) + bli_check_error_code( BLIS_INVALID_COL_STRIDE ); + + dim_t m_mat = 1; + dim_t n_mat = 1; + dim_t k_mat = 1; + for ( auto& i : m ) m_mat *= i; + for ( auto& i : n ) n_mat *= i; + for ( auto& i : k ) k_mat *= i; + + auto& stride_m = has_unit_stride( rs_c ) ? 
rs_c : rs_a; + for ( int i = 1;i < m.size(); i++ ) + for ( int j = 0;j < m.size()-i; j++ ) + if ( stride_m[j] > stride_m[j+1] ) + { + std::swap( rs_a[j], rs_a[j+1] ); + std::swap( rs_c[j], rs_c[j+1] ); + } + + auto& stride_n = has_unit_stride( cs_c ) ? cs_c : cs_b; + for ( int i = 1;i < n.size(); i++ ) + for ( int j = 0;j < n.size()-i; j++ ) + if ( stride_n[j] > stride_n[j+1] ) + { + std::swap( cs_b[j], cs_b[j+1] ); + std::swap( cs_c[j], cs_c[j+1] ); + } + + auto& stride_k = has_unit_stride( cs_a ) ? cs_a : rs_b; + for ( int i = 1;i < k.size(); i++ ) + for ( int j = 0;j < k.size()-i; j++ ) + if ( stride_k[j] > stride_k[j+1] ) + { + std::swap( cs_a[j], cs_a[j+1] ); + std::swap( rs_b[j], rs_b[j+1] ); + } + + if ( rs_a.empty() ) rs_a.push_back( 1 ); + if ( cs_a.empty() ) cs_a.push_back( 1 ); + if ( rs_b.empty() ) rs_b.push_back( 1 ); + if ( cs_b.empty() ) cs_b.push_back( 1 ); + if ( rs_c.empty() ) rs_c.push_back( 1 ); + if ( cs_c.empty() ) cs_c.push_back( 1 ); + + obj_t a_o, b_o, c_o; + bli_obj_create_with_attached_buffer( dt, m_mat, k_mat, const_cast(a), rs_a[0], cs_a[0], &a_o ); + bli_obj_create_with_attached_buffer( dt, k_mat, n_mat, const_cast(b), rs_b[0], cs_b[0], &b_o ); + bli_obj_create_with_attached_buffer( dt, m_mat, n_mat, c , rs_c[0], cs_c[0], &c_o ); + + packm_tensor_params_t params_a( m.size(), m.data(), rs_a.data(), + k.size(), k.data(), cs_a.data() ); + packm_tensor_params_t params_b( n.size(), n.data(), cs_b.data(), + k.size(), k.data(), rs_b.data() ); + gemm_tensor_params_t params_c( m.size(), m.data(), rs_c.data(), + n.size(), n.data(), cs_c.data() ); + + bli_obj_set_pack_fn( packm_tensor, &a_o ); + bli_obj_set_pack_fn( packm_tensor, &b_o ); + bli_obj_set_ker_fn( gemm_tensor, &c_o ); + bli_obj_set_pack_params( ¶ms_a, &a_o ); + bli_obj_set_pack_params( ¶ms_b, &b_o ); + bli_obj_set_ker_params( ¶ms_c, &c_o ); + + obj_t alpha_o, beta_o; + bli_obj_create_1x1_with_attached_buffer( dt, const_cast(alpha), &alpha_o ); + 
bli_obj_create_1x1_with_attached_buffer( dt, const_cast(beta), &beta_o ); + + rntm_t rntm; + bli_rntm_init_from_global( &rntm ); + bli_rntm_disable_l3_sup( &rntm ); + + bli_gemm_ex( &alpha_o, &a_o, &b_o, &beta_o, &c_o, NULL, &rntm ); +} + +int main() +{ + auto N = 5; + + gint_t ndim_a = 4; + gint_t ndim_b = 4; + gint_t ndim_c = 4; + + std::vector len_a(ndim_a, N); + std::vector len_b(ndim_b, N); + std::vector len_c(ndim_c, N); + + std::vector stride_a(ndim_a, 1); + std::vector stride_b(ndim_b, 1); + std::vector stride_c(ndim_c, 1); + for ( gint_t i = 1; i < ndim_a; i++ ) + stride_a[i] = stride_a[i-1] * len_a[i - 1]; + for ( gint_t i = 1; i < ndim_b; i++ ) + stride_b[i] = stride_b[i-1] * len_b[i - 1]; + for ( gint_t i = 1; i < ndim_c; i++ ) + stride_c[i] = stride_c[i-1] * len_c[i - 1]; + + std::vector dim_a(ndim_a); + std::vector dim_b(ndim_b); + std::vector dim_c(ndim_c); + std::iota(dim_a.begin(), dim_a.end(), 0); + std::iota(dim_b.begin(), dim_b.end(), 0); + std::iota(dim_c.begin(), dim_c.end(), 0); + + for ( int dt_ = BLIS_DT_LO; dt_ <= BLIS_DT_HI; dt_++ ) + do + do + do + { + auto dt = ( num_t )dt_; + + auto ndim_m = (ndim_a + ndim_c - ndim_b)/2; + auto ndim_k = (ndim_a + ndim_b - ndim_c)/2; + + std::vector m(len_a.begin(), len_a.begin()+ndim_m); + std::vector n(len_b.begin()+ndim_k, len_b.end()); + std::vector k(len_b.begin(), len_b.begin()+ndim_k); + + std::vector rs_a(stride_a.begin(), stride_a.begin()+ndim_m); + std::vector cs_a(stride_a.begin()+ndim_m, stride_a.end()); + std::vector rs_b(stride_b.begin(), stride_b.begin()+ndim_k); + std::vector cs_b(stride_b.begin()+ndim_k, stride_b.end()); + std::vector rs_c(stride_c.begin(), stride_c.begin()+ndim_m); + std::vector cs_c(stride_c.begin()+ndim_m, stride_c.end()); + + dim_t m_tot = 1; + dim_t n_tot = 1; + dim_t k_tot = 1; + for ( auto i : m ) m_tot *= i; + for ( auto i : n ) n_tot *= i; + for ( auto i : k ) k_tot *= i; + + obj_t a, b, c, c_ref, norm; + + bli_obj_create( dt, m_tot*k_tot, 1, 1, 1, &a ); + 
bli_obj_create( dt, k_tot*n_tot, 1, 1, 1, &b ); + bli_obj_create( dt, m_tot*n_tot, 1, 1, 1, &c ); + bli_obj_create( dt, m_tot*n_tot, 1, 1, 1, &c_ref ); + bli_obj_create_1x1( bli_dt_proj_to_real( dt ), &norm ); + + bli_randv( &a ); + bli_randv( &b ); + bli_randv( &c ); + bli_copyv( &c, &c_ref ); + + tcontract( dt, m, n, k, + bli_obj_buffer_for_const( dt, &BLIS_ONE ), + bli_obj_buffer( &a ), rs_a, cs_a, + bli_obj_buffer( &b ), rs_b, cs_b, + bli_obj_buffer_for_const( dt, &BLIS_ZERO ), + bli_obj_buffer( &c ), rs_c, cs_c ); + + tcontract_ref( dt, m, n, k, + bli_obj_buffer_for_const( dt, &BLIS_ONE ), + bli_obj_buffer( &a ), rs_a, cs_a, + bli_obj_buffer( &b ), rs_b, cs_b, + bli_obj_buffer_for_const( dt, &BLIS_ZERO ), + bli_obj_buffer( &c_ref ), rs_c, cs_c ); + + bli_subv( &c_ref, &c ); + bli_normfv( &c, &norm ); + + double normr, normi; + bli_getsc( &norm, &normr, &normi ); + + printf("dt: %d, dim_a: [%d,%d,%d,%d], dim_b: [%d,%d,%d,%d], dim_c: [%d,%d,%d,%d], norm: %g\n", + dt, dim_a[0], dim_a[1], dim_a[2], dim_a[3], + dim_b[0], dim_b[1], dim_b[2], dim_b[3], + dim_c[0], dim_c[1], dim_c[2], dim_c[3], + normr / std::sqrt( bli_obj_vector_dim( &c ) ) ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_ref ); + } + while (std::next_permutation(dim_a.begin(), dim_a.end())); + while (std::next_permutation(dim_b.begin(), dim_b.end())); + while (std::next_permutation(dim_c.begin(), dim_c.end())); +} + diff --git a/test/tensor_contraction/tcontract_ref.cxx b/test/tensor_contraction/tcontract_ref.cxx new file mode 100644 index 000000000..b4cd07f90 --- /dev/null +++ b/test/tensor_contraction/tcontract_ref.cxx @@ -0,0 +1,67 @@ +#include "tcontract_ref.hpp" + +template +void tcontract_ref( const std::vector& m, const std::vector& n, const std::vector& k, + const void* alpha, const void* a, const std::vector& rs_a, const std::vector& cs_a, + const void* b, const std::vector& rs_b, const std::vector& cs_b, + const void* beta, void* c, const 
std::vector& rs_c, const std::vector& cs_c ) +{ + auto alpha_cast = *( T* )alpha; + auto beta_cast = *( T* )beta; + auto a_cast = ( T* )a; + auto b_cast = ( T* )b; + auto c_cast = ( T* )c; + + for_each(m.size(), m.data(), a_cast, rs_a.data(), c_cast, rs_c.data(), + [&] + { + for_each(n.size(), n.data(), b_cast, cs_b.data(), c_cast, cs_c.data(), + [&] + { + auto ab = convert(0.0); + + for_each(k.size(), k.data(), a_cast, cs_a.data(), b_cast, rs_b.data(), + [&] + { + ab += (*a_cast) * (*b_cast); + }); + + if ( beta_cast == convert(0.0) ) + { + *c_cast = alpha_cast * ab; + } + else + { + *c_cast = alpha_cast * ab + beta_cast * (*c_cast); + } + }); + + assert(b_cast == b); + }); + + assert(a_cast == a); + assert(c_cast == c); +} + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +static auto PASTEMAC(ch,op) = &tcontract_ref; + +INSERT_GENTFUNC_BASIC0(tcontract_ref); + +static decltype(&tcontract_ref) GENARRAY( tcontract_ref_impl, tcontract_ref ); + +void tcontract_ref( num_t dt, const std::vector& m, const std::vector& n, const std::vector& k, + const void* alpha, const void* a, const std::vector& rs_a, const std::vector& cs_a, + const void* b, const std::vector& rs_b, const std::vector& cs_b, + const void* beta, void* c, const std::vector& rs_c, const std::vector& cs_c ) +{ + tcontract_ref_impl[ dt ] + ( + m, n, k, + alpha, a, rs_a, cs_a, + b, rs_b, cs_b, + beta, c, rs_c, cs_c + ); +} + diff --git a/test/tensor_contraction/tcontract_ref.hpp b/test/tensor_contraction/tcontract_ref.hpp new file mode 100644 index 000000000..99d4380dc --- /dev/null +++ b/test/tensor_contraction/tcontract_ref.hpp @@ -0,0 +1,100 @@ +#include "blis.h" +#include "complex_math.hpp" + +#include +#include +#include + +inline void increment(inc_t, gint_t) {} + +template +void increment(inc_t n, gint_t i, T& off, const inc_t* s, Args&... 
args) +{ + off += s[i]*n; + increment(n, i, args...); +} + +template +void for_each_impl(gint_t ndim, const dim_t* n, + dim_t off, dim_t len, + Body& body, + Args&... args) +{ + std::array i = {}; + assert( ndim <= i.size() ); + + if ( off ) + { + for ( gint_t k = 0; k < ndim; k++ ) + { + i[k] = off % n[k]; + off /= n[k]; + increment(i[k], k, args...); + } + } + + for ( dim_t pos = 0; pos < len; pos++ ) + { + body(); + + for ( gint_t k = 0; k < ndim; k++ ) + { + if ( i[k] == n[k]-1 ) + { + increment(-i[k], k, args...); + i[k] = 0; + } + else + { + increment(1, k, args...); + i[k]++; + break; + } + } + } +} + +template +void for_each(gint_t ndim, const dim_t* n, + dim_t off, dim_t len, + T& a, const inc_t* s_a, + Body&& body) +{ + for_each_impl( ndim, n, off, len, body, a, s_a ); +} + +template +void for_each(gint_t ndim, const dim_t* n, + dim_t off, dim_t len, + T& a, const inc_t* s_a, + T& b, const inc_t* s_b, + Body&& body) +{ + for_each_impl( ndim, n, off, len, body, a, s_a, b, s_b ); +} + +template +void for_each(gint_t ndim, const dim_t* n, + T& a, const inc_t* s_a, + Body&& body) +{ + dim_t len = 1; + for ( gint_t i = 0;i < ndim;i++ ) len *= n[i]; + for_each_impl( ndim, n, 0, len, body, a, s_a ); +} + +template +void for_each(gint_t ndim, const dim_t* n, + T& a, const inc_t* s_a, + T& b, const inc_t* s_b, + Body&& body) +{ + dim_t len = 1; + for ( gint_t i = 0;i < ndim;i++ ) len *= n[i]; + for_each_impl( ndim, n, 0, len, body, a, s_a, b, s_b ); +} + +void tcontract_ref( num_t dt, const std::vector& m, const std::vector& n, const std::vector& k, + const void* alpha, const void* a, const std::vector& rs_a, const std::vector& cs_a, + const void* b, const std::vector& rs_b, const std::vector& cs_b, + const void* beta, void* c, const std::vector& rs_c, const std::vector& cs_c ); From 08174a2f6ebbd8ed5aa2bc4edc45da80962f06bb Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 1 Jan 2022 21:35:19 +0900 Subject: [PATCH 015/230] Evict Requirement for SVE GEMM For 8<= 
GCC < 10 compatibility. --- config/armsve/bli_cntx_init_armsve.c | 1 - .../armsve/3/bli_armsve_utils.c | 6 +++--- .../armsve/3/bli_armsve_utils.h | 2 +- kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 3 +-- kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 3 +-- kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 3 +-- kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 3 +-- kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c | 3 +-- kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c | 4 +--- kernels/armsve/bli_kernels_armsve.h | 1 + 10 files changed, 11 insertions(+), 18 deletions(-) rename config/armsve/bli_armsve_config_utils.c => kernels/armsve/3/bli_armsve_utils.c (97%) rename config/armsve/bli_armsve_config_utils.h => kernels/armsve/3/bli_armsve_utils.h (98%) diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index fafed2229..cd07924a7 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -33,7 +33,6 @@ */ #include "blis.h" -#include "bli_armsve_config_utils.h" void bli_cntx_init_armsve( cntx_t* cntx ) { diff --git a/config/armsve/bli_armsve_config_utils.c b/kernels/armsve/3/bli_armsve_utils.c similarity index 97% rename from config/armsve/bli_armsve_config_utils.c rename to kernels/armsve/3/bli_armsve_utils.c index 70501e39d..1e3256d34 100644 --- a/config/armsve/bli_armsve_config_utils.c +++ b/kernels/armsve/3/bli_armsve_utils.c @@ -35,7 +35,7 @@ */ #include "blis.h" -dim_t bli_vl_bits_armsve(void) +dim_t bli_vl_bytes_armsve(void) { \ uint64_t vl = 0; __asm__ ( @@ -43,7 +43,7 @@ dim_t bli_vl_bits_armsve(void) " incb x0 \n\t" " mov %[vl], x0 \n\t" : [vl] "=r" (vl) - : + : : "x0" ); return vl; @@ -64,7 +64,7 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \ dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \ dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \ \ - dim_t vl_b = bli_vl_bits_armsve(); \ 
+ dim_t vl_b = bli_vl_bytes_armsve(); \ dim_t vl = vl_b / S_Data; \ dim_t m_r = 2 * vl; \ dim_t n_r = 10; \ diff --git a/config/armsve/bli_armsve_config_utils.h b/kernels/armsve/3/bli_armsve_utils.h similarity index 98% rename from config/armsve/bli_armsve_config_utils.h rename to kernels/armsve/3/bli_armsve_utils.h index 87bba73ed..6d3aab05d 100644 --- a/config/armsve/bli_armsve_config_utils.h +++ b/kernels/armsve/3/bli_armsve_utils.h @@ -35,7 +35,7 @@ */ #include "blis.h" -dim_t bli_vl_bits_armsve(void); +dim_t bli_vl_bytes_armsve(void); void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 913abd1f6..c84a59f07 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -42,7 +42,6 @@ // 2vx10 microkernels. #include "armsve_asm_2vx10cmplx.h" -#include "arm_sve.h" void bli_cgemm_armsve_asm_2vx10_unindexed ( @@ -69,7 +68,7 @@ void bli_cgemm_armsve_asm_2vx10_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = svcntw(); + uint64_t mr = bli_vl_bytes_armsve() * 2 / 8; GEMM_UKR_SETUP_CT( c, mr, 10, false ); __asm__ volatile ( diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 9730fb8ce..5a662df4e 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -42,7 +42,6 @@ // 2vx10 microkernels. 
#include "armsve_asm_2vx10.h" -#include "arm_sve.h" void bli_dgemm_armsve_asm_2vx10_unindexed ( @@ -68,7 +67,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - uint64_t mr = 2*svcntd(); + uint64_t mr = bli_vl_bytes_armsve() * 2 / 8; GEMM_UKR_SETUP_CT( d, mr, 10, false ); __asm__ volatile ( diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 74c4779d7..caa70a5e5 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -42,7 +42,6 @@ // 2vx10 microkernels. #include "armsve_asm_2vx10.h" -#include "arm_sve.h" void bli_sgemm_armsve_asm_2vx10_unindexed ( @@ -68,7 +67,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - uint64_t mr = 2*svcntw(); + uint64_t mr = bli_vl_bytes_armsve() * 2 / 4; GEMM_UKR_SETUP_CT( s, mr, 10, false ); __asm__ volatile ( diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index ee041b3c4..25084af35 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -42,7 +42,6 @@ // 2vx10 microkernels. #include "armsve_asm_2vx10cmplx.h" -#include "arm_sve.h" void bli_zgemm_armsve_asm_2vx10_unindexed ( @@ -69,7 +68,7 @@ void bli_zgemm_armsve_asm_2vx10_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = svcntd(); + uint64_t mr = bli_vl_bytes_armsve() * 2 / 16; GEMM_UKR_SETUP_CT( z, mr, 10, false ); __asm__ volatile ( diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c index 641944ecd..ca62f9db1 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c @@ -42,7 +42,6 @@ // 2vx7 microkernels. 
#include "armsve_asm_2vx7cmplx.h" -#include "arm_sve.h" void bli_zgemm_armsve_asm_2vx7_unindexed ( @@ -69,7 +68,7 @@ void bli_zgemm_armsve_asm_2vx7_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = svcntd(); + uint64_t mr = bli_vl_bytes_armsve() * 2 / 16; GEMM_UKR_SETUP_CT( z, mr, 7, false ); __asm__ volatile ( diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c index 4272f72c0..4a910baac 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c @@ -42,8 +42,6 @@ // 2vx8 microkernels. #include "armsve_asm_2vx8cmplx.h" -#include "arm_sve.h" - void bli_zgemm_armsve_asm_2vx8_unindexed ( dim_t m, @@ -69,7 +67,7 @@ void bli_zgemm_armsve_asm_2vx8_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = svcntd(); + uint64_t mr = bli_vl_bytes_armsve() * 2 / 16; GEMM_UKR_SETUP_CT( z, mr, 8, false ); __asm__ volatile ( diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index 0d5c5dc47..408300308 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include "./3/bli_armsve_utils.h" GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) From 466b68a3ad118342dc49a8130b7b02f5e7748521 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 2 Jan 2022 14:59:41 -0600 Subject: [PATCH 016/230] Add unique tag to branch labels for Apple ARM64. Add `%=` tag to branch labels, which expands to a unique identifier for each inline assembly block. This prevents duplicate symbol errors on Apple Silicon (#594). Fixes #594. [ci skip] since we can't test Apple Silicon anyways... 
--- kernels/armv8a/3/armv8a_asm_utils.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h index 5cb0bad69..6ef6a3fb0 100644 --- a/kernels/armv8a/3/armv8a_asm_utils.h +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -36,10 +36,10 @@ // Apple's local label requirements. #if defined(__APPLE__) -#define LABEL(str) " L" #str": \n\t" -#define BEQ(str) "b.eq L" #str" \n\t" -#define BNE(str) "b.ne L" #str" \n\t" -#define BRANCH(str) "b L" #str" \n\t" +#define LABEL(str) " L" #str"%=: \n\t" +#define BEQ(str) "b.eq L" #str"%= \n\t" +#define BNE(str) "b.ne L" #str"%= \n\t" +#define BRANCH(str) "b L" #str"%= \n\t" #else #define LABEL(str) " ." #str": \n\t" #define BEQ(str) "b.eq ." #str" \n\t" From 864bfab4486ac910ef9a366e9ade4b45a39747fc Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 4 Jan 2022 15:10:34 -0600 Subject: [PATCH 017/230] CREDITS file update. --- CREDITS | 1 + 1 file changed, 1 insertion(+) diff --git a/CREDITS b/CREDITS index 81fc9bec5..7dd452daa 100644 --- a/CREDITS +++ b/CREDITS @@ -70,6 +70,7 @@ but many others have contributed code and feedback, including Stepan Nassyr @stepannassyr (Jülich Supercomputing Centre) Nisanth Padinharepatt (AMD) Ajay Panyala @ajaypanyala + Marc-Antoine Parent @maparent (Conversence) Devangi Parikh @dnparikh (The University of Texas at Austin) Elmar Peise @elmar-peise (RWTH-Aachen) Clément Pernet @ClementPernet From 3f2440b0226d5e23a43d12105d74aa917cd6c610 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 6 Jan 2022 14:57:36 -0600 Subject: [PATCH 018/230] Added m, n dims to gemmd/gemmlike ukernel calls. Details: - Updated the gemmd addon and the gemmlike sandbox code to use the new microkernel calling sequence, which now includes m and n dimensions so that the microkernel has all the information necessary to handle edge cases. 
Thanks to Jeff Diamond for catching this, which ideally would have been included in commit 54fa28b. - Retired var2 of both gemmd and gemmlike to 'attic' directories and removed their corresponding prototypes. In both cases, var2 was a variant of the block-panel algorithm where edge-case handling was abstracted away to a microkernel wrapper. (Since this is now the official behavior of BLIS microkernels, I saw no need to have it included as a separate code path.) - Comment updates. --- addon/gemmd/{ => attic}/bao_gemmd_bp_var2.c | 0 addon/gemmd/bao_gemmd.c | 20 +----- addon/gemmd/bao_gemmd_bp_var1.c | 67 ++++--------------- addon/gemmd/bao_gemmd_var.h | 7 -- .../gemmlike/{ => attic}/bls_gemm_bp_var2.c | 0 sandbox/gemmlike/bls_gemm.c | 19 +----- sandbox/gemmlike/bls_gemm_bp_var1.c | 67 ++++--------------- sandbox/gemmlike/bls_gemm_var.h | 7 -- 8 files changed, 30 insertions(+), 157 deletions(-) rename addon/gemmd/{ => attic}/bao_gemmd_bp_var2.c (100%) rename sandbox/gemmlike/{ => attic}/bls_gemm_bp_var2.c (100%) diff --git a/addon/gemmd/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c similarity index 100% rename from addon/gemmd/bao_gemmd_bp_var2.c rename to addon/gemmd/attic/bao_gemmd_bp_var2.c diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c index 71d49806b..fadc52691 100644 --- a/addon/gemmd/bao_gemmd.c +++ b/addon/gemmd/bao_gemmd.c @@ -197,9 +197,7 @@ void bao_gemmd_int // In this function, we choose the gemmd implementation that is executed // on each thread. -#if 1 - // Call the block-panel algorithm that calls the kernel directly, which - // exposes edge-case handling. + // Call the block-panel algorithm. bao_gemmd_bp_var1 ( alpha, @@ -212,22 +210,6 @@ void bao_gemmd_int rntm, thread ); -#else - // Call the block-panel algorithm that calls the kernel indirectly via a - // wrapper function, which hides edge-case handling. 
- bao_gemmd_bp_var2 - ( - alpha, - a, - d, - b, - beta, - c, - cntx, - rntm, - thread - ); -#endif } // diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index e042f1fd8..09e4df09e 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -164,17 +164,6 @@ void PASTECH2(bao_,ch,varname) \ function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ @@ -203,7 +192,6 @@ void PASTECH2(bao_,ch,varname) \ ctype alpha_local = *alpha_cast; \ ctype beta_local = *beta_cast; \ ctype one_local = *PASTEMAC(ch,1); \ - ctype zero_local = *PASTEMAC(ch,0); \ \ auxinfo_t aux; \ \ @@ -449,47 +437,20 @@ void PASTECH2(bao_,ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( mr_cur == MR && nr_cur == NR ) \ - { \ - /* Invoke the gemm microkernel. */ \ - gemm_ukr \ - ( \ - kc_cur, \ - &alpha_local, \ - a_ir, \ - b_jr, \ - beta_use, \ - c_ir, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm microkernel. */ \ - gemm_ukr \ - ( \ - kc_cur, \ - &alpha_local, \ - a_ir, \ - b_jr, \ - &zero_local, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the bottom edge of C and add the result from above. 
*/ \ - PASTEMAC(ch,xpbys_mxn) \ - ( \ - mr_cur, \ - nr_cur, \ - ct, rs_ct, cs_ct, \ - beta_use, \ - c_ir, rs_c, cs_c \ - ); \ - } \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + &alpha_local, \ + a_ir, \ + b_jr, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/gemmd/bao_gemmd_var.h index 5c6674727..05ec45e07 100644 --- a/addon/gemmd/bao_gemmd_var.h +++ b/addon/gemmd/bao_gemmd_var.h @@ -54,7 +54,6 @@ void PASTECH(bao_,opname) \ ); GENPROT( gemmd_bp_var1 ) -GENPROT( gemmd_bp_var2 ) // @@ -88,12 +87,6 @@ GENTPROT( double, d, gemmd_bp_var1 ) GENTPROT( scomplex, c, gemmd_bp_var1 ) GENTPROT( dcomplex, z, gemmd_bp_var1 ) -//INSERT_GENTPROT_BASIC0( gemmd_bp_var2 ) -GENTPROT( float, s, gemmd_bp_var2 ) -GENTPROT( double, d, gemmd_bp_var2 ) -GENTPROT( scomplex, c, gemmd_bp_var2 ) -GENTPROT( dcomplex, z, gemmd_bp_var2 ) - // // Prototype the typed kernel interfaces. diff --git a/sandbox/gemmlike/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c similarity index 100% rename from sandbox/gemmlike/bls_gemm_bp_var2.c rename to sandbox/gemmlike/attic/bls_gemm_bp_var2.c diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index 0b15f2197..f2f8b7e25 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -192,9 +192,7 @@ void bls_gemm_int // In this function, we choose the gemm implementation that is executed // on each thread. -#if 1 - // Call the block-panel algorithm that calls the kernel directly, which - // exposes edge-case handling. + // Call the block-panel algorithm. bls_gemm_bp_var1 ( alpha, @@ -206,21 +204,6 @@ void bls_gemm_int rntm, thread ); -#else - // Call the block-panel algorithm that calls the kernel indirectly via a - // wrapper function, which hides edge-case handling. 
- bls_gemm_bp_var2 - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm, - thread - ); -#endif } // diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index ae695ce34..62dc462d5 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -157,17 +157,6 @@ void PASTECH2(bls_,ch,varname) \ function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ @@ -194,7 +183,6 @@ void PASTECH2(bls_,ch,varname) \ ctype alpha_local = *alpha_cast; \ ctype beta_local = *beta_cast; \ ctype one_local = *PASTEMAC(ch,1); \ - ctype zero_local = *PASTEMAC(ch,0); \ \ auxinfo_t aux; \ \ @@ -437,47 +425,20 @@ void PASTECH2(bls_,ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( mr_cur == MR && nr_cur == NR ) \ - { \ - /* Invoke the gemm microkernel. */ \ - gemm_ukr \ - ( \ - kc_cur, \ - &alpha_local, \ - a_ir, \ - b_jr, \ - beta_use, \ - c_ir, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the gemm microkernel. 
*/ \ - gemm_ukr \ - ( \ - kc_cur, \ - &alpha_local, \ - a_ir, \ - b_jr, \ - &zero_local, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale the bottom edge of C and add the result from above. */ \ - PASTEMAC(ch,xpbys_mxn) \ - ( \ - mr_cur, \ - nr_cur, \ - ct, rs_ct, cs_ct, \ - beta_use, \ - c_ir, rs_c, cs_c \ - ); \ - } \ + /* Invoke the gemm microkernel. */ \ + gemm_ukr \ + ( \ + mr_cur, \ + nr_cur, \ + kc_cur, \ + &alpha_local, \ + a_ir, \ + b_jr, \ + beta_use, \ + c_ir, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ } \ } \ } \ diff --git a/sandbox/gemmlike/bls_gemm_var.h b/sandbox/gemmlike/bls_gemm_var.h index 025b54a06..7c515f8c3 100644 --- a/sandbox/gemmlike/bls_gemm_var.h +++ b/sandbox/gemmlike/bls_gemm_var.h @@ -53,7 +53,6 @@ void PASTECH(bls_,opname) \ ); GENPROT( gemm_bp_var1 ) -GENPROT( gemm_bp_var2 ) // @@ -86,12 +85,6 @@ GENTPROT( double, d, gemm_bp_var1 ) GENTPROT( scomplex, c, gemm_bp_var1 ) GENTPROT( dcomplex, z, gemm_bp_var1 ) -//INSERT_GENTPROT_BASIC0( gemm_bp_var2 ) -GENTPROT( float, s, gemm_bp_var2 ) -GENTPROT( double, d, gemm_bp_var2 ) -GENTPROT( scomplex, c, gemm_bp_var2 ) -GENTPROT( dcomplex, z, gemm_bp_var2 ) - // // Prototype the typed kernel interfaces. From 268ce1f29a717d18304713ecc25a2eafe41838c7 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 10 Jan 2022 10:17:17 -0600 Subject: [PATCH 019/230] Relax alignment constraints Remove alignment of temporary AB buffer in edge case handling macros unless alignment is specifically requested (e.g. Core2, SDB/IVB). Fixes #595. --- frame/include/bli_edge_case_macro_defs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h index 242045a02..4a1fba7ac 100644 --- a/frame/include/bli_edge_case_macro_defs.h +++ b/frame/include/bli_edge_case_macro_defs.h @@ -38,14 +38,14 @@ // Helper macros for edge-case handling within gemm microkernels. 
-#define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major) \ +#define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; @@ -64,27 +64,27 @@ #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ - GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ - GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ - GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ - GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major); \ + GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ From 81f93be0561c705ae6823d19e40849facc40bef7 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 10 Jan 2022 10:19:47 -0600 Subject: [PATCH 020/230] Fix row-/column-major pref. 
in 16x6 haswell sgemm ukr (unused) --- kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index a3a8b0b09..dd9526d56 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -100,7 +100,7 @@ void bli_sgemm_haswell_asm_16x6 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - GEMM_UKR_SETUP_CT( s, 16, 6, true ); + GEMM_UKR_SETUP_CT( s, 16, 6, false ); begin_asm() From 0ab20c0e72402ba0b17fe2c3ed3e16bf2ace0fd3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 13 Jan 2022 07:29:56 -0800 Subject: [PATCH 021/230] the Apple local label thing is required by Clang in general @egaudry and I both saw this issue on Linux with Clang 10. ``` Compiling obj/thunderx2/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o ('thunderx2' CFLAGS for kernels) kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c:171:49: fatal error: invalid symbol redefinition " \n\t" ^ :90:5: note: instantiated into assembly here .SLOOPKITER: ^ 1 error generated. ``` Signed-off-by: Jeff Hammond --- kernels/armv8a/3/armv8a_asm_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h index 6ef6a3fb0..465950999 100644 --- a/kernels/armv8a/3/armv8a_asm_utils.h +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -35,7 +35,7 @@ */ // Apple's local label requirements. -#if defined(__APPLE__) +#if defined(__APPLE__) || defined(__clang__) #define LABEL(str) " L" #str"%=: \n\t" #define BEQ(str) "b.eq L" #str"%= \n\t" #define BNE(str) "b.ne L" #str"%= \n\t" From 0be9282cdccf73342d8571d3f7971a9b0af72363 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 26 Jan 2022 17:46:24 -0600 Subject: [PATCH 022/230] Updated zen3 macro constant names. 
Details: - In config/zen3/bli_family_zen3.h, renamed: BLIS_SMALL_MATRIX_A_THRES_M_SYRK -> _M_GEMMT BLIS_SMALL_MATRIX_A_THRES_N_SYRK -> _N_GEMMT Thanks to Jeff Diamond for helping spot the stale _SYRK naming. --- config/zen3/bli_family_zen3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h index 918e919ae..8487a7277 100644 --- a/config/zen3/bli_family_zen3.h +++ b/config/zen3/bli_family_zen3.h @@ -63,8 +63,8 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_SYRK 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_SYRK 128 +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 From 35195bb5cea5d99eb3eaf41e3815137d14ceb52d Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 31 Jan 2022 10:29:50 -0600 Subject: [PATCH 023/230] Add armclang detection to configure. armclang is treated as regular clang. Fixes #606. [ci skip] --- configure | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 1abe7610e..95a97c6b1 100755 --- a/configure +++ b/configure @@ -1464,17 +1464,31 @@ get_compiler_version() cc_vendor="aocc" fi + # Detect armclang, which doesn't have a nice, unambiguous, one-word tag + armclang_grep=$(echo "${vendor_string}" | grep 'Arm C/C++/Fortran Compiler') + if [ -n "${armclang_grep}" ]; then + cc_vendor="armclang" + fi + # Begin parsing cc_vendor for the version string. if [ "${cc_vendor}" = "crosstool-NG" ]; then # Treat compilers built by crosstool-NG (for eg: conda) as gcc. - cc_vendor="gcc" + cc_vendor="gcc" fi if [ "${cc_vendor}" = "icc" -o \ "${cc_vendor}" = "gcc" ]; then cc_version=$(${cc} -dumpversion) + elif [ "${cc_vendor}" = "armclang" ]; then + + # Treat armclang as regular clang. 
+ cc_vendor="clang" + cc_version=$(echo "${vendor_string}" \ + | egrep -o 'based on LLVM [0-9]+\.[0-9]+\.?[0-9]*' \ + | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*') + elif [ "${cc_vendor}" = "clang" ]; then cc_version=$(echo "${vendor_string}" \ From b5df1811f1bc8212b2cda6bb97b79819afe236a8 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Thu, 3 Feb 2022 02:31:29 +0900 Subject: [PATCH 024/230] Armv8a, ArmSVE: Simplify Gen-C --- .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 74 +- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 68 +- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 62 +- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 74 +- .../armsve/3/{ => old}/armsve_asm_2vx7cmplx.h | 0 .../armsve/3/{ => old}/armsve_asm_2vx8cmplx.h | 0 .../3/{ => old}/armsve_asm_macros_half.h | 0 .../bli_gemm_armsve_asm_z2vx7_unindexed.c | 0 .../bli_gemm_armsve_asm_z2vx8_unindexed.c | 0 kernels/armsve/bli_kernels_armsve.h | 6 +- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 659 +----------------- 11 files changed, 154 insertions(+), 789 deletions(-) rename kernels/armsve/3/{ => old}/armsve_asm_2vx7cmplx.h (100%) rename kernels/armsve/3/{ => old}/armsve_asm_2vx8cmplx.h (100%) rename kernels/armsve/3/{ => old}/armsve_asm_macros_half.h (100%) rename kernels/armsve/3/{ => old}/bli_gemm_armsve_asm_z2vx7_unindexed.c (100%) rename kernels/armsve/3/{ => old}/bli_gemm_armsve_asm_z2vx8_unindexed.c (100%) diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index c84a59f07..60a64515f 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -118,8 +118,8 @@ void bli_cgemm_armsve_asm_2vx10_unindexed GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" " CCOL_PRFM: \n\t" -" cmp %3, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. 
+// " cmp %3, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -233,8 +233,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " WRITE_MEM_EXEC: \n\t" " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. -" cmp %3, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp %3, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" " fmov s29, wzr \n\t" @@ -260,38 +260,38 @@ GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" -" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, -" mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. -" index z28.s, wzr, w3 \n\t" -" fmov s29, wzr \n\t" -" fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. -" fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. 
-" b.eq ZERO_BETA_G_0_1_2_3 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_G_0_1_2_3: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) -" \n\t" -" b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) -GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_G_4_5_6_7_8_9: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" +// " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +// " mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. +// " index z28.s, wzr, w3 \n\t" +// " fmov s29, wzr \n\t" +// " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. +// " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. 
+// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +// " ZERO_BETA_G_0_1_2_3: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +// " \n\t" +// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" " \n\t" " END_EXEC: \n\t" " mov %11, #0 \n\t" // Return normal. diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 5a662df4e..7136104b5 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -78,7 +78,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " mov x3, #10 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. +// " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x8, 0x3 \n\t" // Tag C address. 
@@ -117,8 +117,8 @@ void bli_dgemm_armsve_asm_2vx10_unindexed GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " \n\t" " CCOL_PRFM: \n\t" -" cmp x6, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// " cmp x6, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -253,8 +253,8 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. -" cmp x6, #1 \n\t" // Preload first half of C for contiguous case. -" b.ne WRITE_MEM \n\t" +// " cmp x6, #1 \n\t" // Preload first half of C for contiguous case. +// " b.ne WRITE_MEM \n\t" GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) " \n\t" " WRITE_MEM: \n\t" @@ -265,8 +265,8 @@ GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" " UNIT_ALPHA: \n\t" -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp x6, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. @@ -281,32 +281,32 @@ GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28, " BETA_ZERO_C: \n\t" GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. 
-" mov x8, xzr \n\t" -" incb x8 \n\t" -" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -" \n\t" -" fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. -" b.eq BETA_ZERO_G \n\t" -" \n\t" -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -" \n\t" -" BETA_ZERO_G: \n\t" -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. +// " mov x8, xzr \n\t" +// " incb x8 \n\t" +// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. +// " index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. +// " \n\t" +// " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. 
+// " b.eq BETA_ZERO_G \n\t" +// " \n\t" +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// " \n\t" +// " BETA_ZERO_G: \n\t" +// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" +// " \n\t" +// " END_ERROR: \n\t" +// " mov x0, #1 \n\t" // Return error. " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. : diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index caa70a5e5..20841891b 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -78,7 +78,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " mov x3, #10 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. +// " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x8, 0x3 \n\t" // Tag C address. @@ -117,8 +117,8 @@ void bli_sgemm_armsve_asm_2vx10_unindexed GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " \n\t" " CCOL_PRFM: \n\t" -" cmp x6, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// " cmp x6, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. 
" mov x16, x5 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -253,8 +253,8 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " UNIT_ALPHA: \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp x6, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. @@ -268,31 +268,31 @@ GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28, " BETA_ZERO_C: \n\t" GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. -" mov x8, xzr \n\t" -" incb x8 \n\t" -" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. 
-" \n\t" -" fcmp s31, #0.0 \n\t" -" b.eq BETA_ZERO_G \n\t" -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -" \n\t" -" BETA_ZERO_G: \n\t" -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. +// " mov x8, xzr \n\t" +// " incb x8 \n\t" +// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. +// " index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. +// " \n\t" +// " fcmp s31, #0.0 \n\t" +// " b.eq BETA_ZERO_G \n\t" +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// " \n\t" +// " BETA_ZERO_G: \n\t" +// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" +// " \n\t" +// " END_ERROR: \n\t" +// " mov x0, #1 \n\t" // Return error. " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. 
: diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 25084af35..7e630894f 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -118,8 +118,8 @@ void bli_zgemm_armsve_asm_2vx10_unindexed GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" " CCOL_PRFM: \n\t" -" cmp %3, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// " cmp %3, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -233,8 +233,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " WRITE_MEM_EXEC: \n\t" " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. -" cmp %3, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp %3, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" " fmov d29, xzr \n\t" @@ -260,38 +260,38 @@ GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" -" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, -" index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. -" fmov d29, xzr \n\t" -" fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. -" fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. 
-" b.eq ZERO_BETA_G_0_1_2_3 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_G_0_1_2_3: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) -" \n\t" -" b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) -GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_G_4_5_6_7_8_9: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" -" \n\t" +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" +// " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +// " index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. +// " fmov d29, xzr \n\t" +// " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. +// " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. 
+// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +// " ZERO_BETA_G_0_1_2_3: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +// " \n\t" +// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" +// " \n\t" " END_EXEC: \n\t" " mov %11, #0 \n\t" // Return normal. 
: "+r" (a), // %0 diff --git a/kernels/armsve/3/armsve_asm_2vx7cmplx.h b/kernels/armsve/3/old/armsve_asm_2vx7cmplx.h similarity index 100% rename from kernels/armsve/3/armsve_asm_2vx7cmplx.h rename to kernels/armsve/3/old/armsve_asm_2vx7cmplx.h diff --git a/kernels/armsve/3/armsve_asm_2vx8cmplx.h b/kernels/armsve/3/old/armsve_asm_2vx8cmplx.h similarity index 100% rename from kernels/armsve/3/armsve_asm_2vx8cmplx.h rename to kernels/armsve/3/old/armsve_asm_2vx8cmplx.h diff --git a/kernels/armsve/3/armsve_asm_macros_half.h b/kernels/armsve/3/old/armsve_asm_macros_half.h similarity index 100% rename from kernels/armsve/3/armsve_asm_macros_half.h rename to kernels/armsve/3/old/armsve_asm_macros_half.h diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx7_unindexed.c similarity index 100% rename from kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c rename to kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx7_unindexed.c diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx8_unindexed.c similarity index 100% rename from kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c rename to kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx8_unindexed.c diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index 408300308..39daf30c6 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -33,13 +33,13 @@ */ #include "./3/bli_armsve_utils.h" -GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) +// GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( scomplex, c, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed ) -GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) -GEMM_UKR_PROT( dcomplex, z, 
gemm_armsve_asm_2vx7_unindexed ) +// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) +// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed ) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 7b420f202..4d9a88817 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -92,8 +92,8 @@ void bli_sgemm_armv8a_asm_8x12 " ldr x10,%[cs_c] \n\t" // Load cs_c. " lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. " \n\t" - " ldr x14,%[rs_c] \n\t" // Load rs_c. - " lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). + // " ldr x14,%[rs_c] \n\t" // Load rs_c. + // " lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). " \n\t" " add x16,x2,x10 \n\t" //Load address Column 1 of C " add x17,x16,x10 \n\t" //Load address Column 2 of C @@ -509,9 +509,6 @@ void bli_sgemm_armv8a_asm_8x12 " ldr x0,%[a_next] \n\t" // Pointer to next block of A. " ldr x1,%[b_next] \n\t" // Pointer to next pointer of B. " \n\t" - " cmp x14,#4 \n\t" // If rs_c != 1 (column-major) - BNE(SGENSTORED) - " \n\t" LABEL(SCOLSTORED) // C is column-major. " \n\t" " dup v0.4s, wzr \n\t" @@ -678,384 +675,8 @@ void bli_sgemm_armv8a_asm_8x12 " str q13, [x27, #16] \n\t" " \n\t" " \n\t" - BRANCH(SEND) // Done. - " \n\t" - " \n\t" - LABEL(SGENSTORED) // C is general-stride stored. - " \n\t" - " \n\t" - " dup v0.4s, wzr \n\t" - " dup v1.4s, wzr \n\t" - " dup v2.4s, wzr \n\t" - " dup v3.4s, wzr \n\t" - " dup v4.4s, wzr \n\t" - " dup v5.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x2 \n\t" - " \n\t" - " ld1 {v0.s}[0],[x5],x14 \n\t" // Load c00 into quad and increment by rs_c. 
- " ld1 {v0.s}[1],[x5],x14 \n\t" // Load c01 into quad and increment by rs_c. - " ld1 {v0.s}[2],[x5],x14 \n\t" // Load c02 into quad and increment by rs_c. - " ld1 {v0.s}[3],[x5],x14 \n\t" // Load c03 into quad and increment by rs_c. - " ld1 {v1.s}[0],[x5],x14 \n\t" // Load c04 into quad and increment by rs_c. - " ld1 {v1.s}[1],[x5],x14 \n\t" // Load c05 into quad and increment by rs_c. - " ld1 {v1.s}[2],[x5],x14 \n\t" // Load c06 into quad and increment by rs_c. - " ld1 {v1.s}[3],[x5],x14 \n\t" // Load c07 into quad and increment by rs_c. - " \n\t" - " mov x5, x16 \n\t" - " \n\t" - " ld1 {v2.s}[0],[x5],x14 \n\t" // Load c10 into quad and increment by rs_c. - " ld1 {v2.s}[1],[x5],x14 \n\t" // Load c11 into quad and increment by rs_c. - " ld1 {v2.s}[2],[x5],x14 \n\t" // Load c12 into quad and increment by rs_c. - " ld1 {v2.s}[3],[x5],x14 \n\t" // Load c13 into quad and increment by rs_c. - " ld1 {v3.s}[0],[x5],x14 \n\t" // Load c14 into quad and increment by rs_c. - " ld1 {v3.s}[1],[x5],x14 \n\t" // Load c15 into quad and increment by rs_c. - " ld1 {v3.s}[2],[x5],x14 \n\t" // Load c16 into quad and increment by rs_c. - " ld1 {v3.s}[3],[x5],x14 \n\t" // Load c17 into quad and increment by rs_c. - " \n\t" - " mov x5, x17 \n\t" - " \n\t" - " ld1 {v4.s}[0],[x5],x14 \n\t" // Load c20 into quad and increment by rs_c. - " ld1 {v4.s}[1],[x5],x14 \n\t" // Load c21 into quad and increment by rs_c. - " ld1 {v4.s}[2],[x5],x14 \n\t" // Load c22 into quad and increment by rs_c. - " ld1 {v4.s}[3],[x5],x14 \n\t" // Load c23 into quad and increment by rs_c. - " ld1 {v5.s}[0],[x5],x14 \n\t" // Load c24 into quad and increment by rs_c. - " ld1 {v5.s}[1],[x5],x14 \n\t" // Load c25 into quad and increment by rs_c. - " ld1 {v5.s}[2],[x5],x14 \n\t" // Load c26 into quad and increment by rs_c. - " ld1 {v5.s}[3],[x5],x14 \n\t" // Load c27 into quad and increment by rs_c. 
- " \n\t" - " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta - " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta - " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta - " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta - " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta - " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS1) - " \n\t" - " fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x2 \n\t" - " \n\t" - " st1 {v0.s}[0],[x5],x14 \n\t" // Store c00 into quad and increment by rs_c. - " st1 {v0.s}[1],[x5],x14 \n\t" // Store c01 into quad and increment by rs_c. - " st1 {v0.s}[2],[x5],x14 \n\t" // Store c02 into quad and increment by rs_c. - " st1 {v0.s}[3],[x5],x14 \n\t" // Store c03 into quad and increment by rs_c. - " st1 {v1.s}[0],[x5],x14 \n\t" // Store c04 into quad and increment by rs_c. - " st1 {v1.s}[1],[x5],x14 \n\t" // Store c05 into quad and increment by rs_c. - " st1 {v1.s}[2],[x5],x14 \n\t" // Store c06 into quad and increment by rs_c. - " st1 {v1.s}[3],[x5],x14 \n\t" // Store c07 into quad and increment by rs_c. - " \n\t" - " mov x5, x16 \n\t" - " \n\t" - " st1 {v2.s}[0],[x5],x14 \n\t" // Store c10 into quad and increment by rs_c. - " st1 {v2.s}[1],[x5],x14 \n\t" // Store c11 into quad and increment by rs_c. - " st1 {v2.s}[2],[x5],x14 \n\t" // Store c12 into quad and increment by rs_c. - " st1 {v2.s}[3],[x5],x14 \n\t" // Store c13 into quad and increment by rs_c. - " st1 {v3.s}[0],[x5],x14 \n\t" // Store c14 into quad and increment by rs_c. - " st1 {v3.s}[1],[x5],x14 \n\t" // Store c15 into quad and increment by rs_c. - " st1 {v3.s}[2],[x5],x14 \n\t" // Store c16 into quad and increment by rs_c. 
- " st1 {v3.s}[3],[x5],x14 \n\t" // Store c17 into quad and increment by rs_c. - " \n\t" - " mov x5, x17 \n\t" - " \n\t" - " st1 {v4.s}[0],[x5],x14 \n\t" // Store c20 into quad and increment by rs_c. - " st1 {v4.s}[1],[x5],x14 \n\t" // Store c21 into quad and increment by rs_c. - " st1 {v4.s}[2],[x5],x14 \n\t" // Store c22 into quad and increment by rs_c. - " st1 {v4.s}[3],[x5],x14 \n\t" // Store c23 into quad and increment by rs_c. - " st1 {v5.s}[0],[x5],x14 \n\t" // Store c24 into quad and increment by rs_c. - " st1 {v5.s}[1],[x5],x14 \n\t" // Store c25 into quad and increment by rs_c. - " st1 {v5.s}[2],[x5],x14 \n\t" // Store c26 into quad and increment by rs_c. - " st1 {v5.s}[3],[x5],x14 \n\t" // Store c27 into quad and increment by rs_c. - " \n\t" - " dup v8.4s, wzr \n\t" - " dup v9.4s, wzr \n\t" - " dup v10.4s, wzr \n\t" - " dup v11.4s, wzr \n\t" - " dup v12.4s, wzr \n\t" - " dup v13.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x19 \n\t" - " \n\t" - " ld1 {v8.s}[0],[x5],x14 \n\t" // Load c30 into quad and increment by rs_c. - " ld1 {v8.s}[1],[x5],x14 \n\t" // Load c31 into quad and increment by rs_c. - " ld1 {v8.s}[2],[x5],x14 \n\t" // Load c32 into quad and increment by rs_c. - " ld1 {v8.s}[3],[x5],x14 \n\t" // Load c33 into quad and increment by rs_c. - " ld1 {v9.s}[0],[x5],x14 \n\t" // Load c34 into quad and increment by rs_c. - " ld1 {v9.s}[1],[x5],x14 \n\t" // Load c35 into quad and increment by rs_c. - " ld1 {v9.s}[2],[x5],x14 \n\t" // Load c36 into quad and increment by rs_c. - " ld1 {v9.s}[3],[x5],x14 \n\t" // Load c37 into quad and increment by rs_c. - " \n\t" - " mov x5, x20 \n\t" - " \n\t" - " ld1 {v10.s}[0],[x5],x14 \n\t" // Load c40 into quad and increment by rs_c. - " ld1 {v10.s}[1],[x5],x14 \n\t" // Load c41 into quad and increment by rs_c. - " ld1 {v10.s}[2],[x5],x14 \n\t" // Load c42 into quad and increment by rs_c. 
- " ld1 {v10.s}[3],[x5],x14 \n\t" // Load c43 into quad and increment by rs_c. - " ld1 {v11.s}[0],[x5],x14 \n\t" // Load c44 into quad and increment by rs_c. - " ld1 {v11.s}[1],[x5],x14 \n\t" // Load c45 into quad and increment by rs_c. - " ld1 {v11.s}[2],[x5],x14 \n\t" // Load c46 into quad and increment by rs_c. - " ld1 {v11.s}[3],[x5],x14 \n\t" // Load c47 into quad and increment by rs_c. - " \n\t" - " mov x5, x21 \n\t" - " \n\t" - " ld1 {v12.s}[0],[x5],x14 \n\t" // Load c50 into quad and increment by rs_c. - " ld1 {v12.s}[1],[x5],x14 \n\t" // Load c51 into quad and increment by rs_c. - " ld1 {v12.s}[2],[x5],x14 \n\t" // Load c52 into quad and increment by rs_c. - " ld1 {v12.s}[3],[x5],x14 \n\t" // Load c53 into quad and increment by rs_c. - " ld1 {v13.s}[0],[x5],x14 \n\t" // Load c54 into quad and increment by rs_c. - " ld1 {v13.s}[1],[x5],x14 \n\t" // Load c55 into quad and increment by rs_c. - " ld1 {v13.s}[2],[x5],x14 \n\t" // Load c56 into quad and increment by rs_c. - " ld1 {v13.s}[3],[x5],x14 \n\t" // Load c57 into quad and increment by rs_c. - " \n\t" - " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta - " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta - " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta - " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta - " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta - " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS2) - " \n\t" - " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x19 \n\t" - " \n\t" - " st1 {v8.s}[0],[x5],x14 \n\t" // Store c30 into quad and increment by rs_c. - " st1 {v8.s}[1],[x5],x14 \n\t" // Store c31 into quad and increment by rs_c. 
- " st1 {v8.s}[2],[x5],x14 \n\t" // Store c32 into quad and increment by rs_c. - " st1 {v8.s}[3],[x5],x14 \n\t" // Store c33 into quad and increment by rs_c. - " st1 {v9.s}[0],[x5],x14 \n\t" // Store c34 into quad and increment by rs_c. - " st1 {v9.s}[1],[x5],x14 \n\t" // Store c35 into quad and increment by rs_c. - " st1 {v9.s}[2],[x5],x14 \n\t" // Store c36 into quad and increment by rs_c. - " st1 {v9.s}[3],[x5],x14 \n\t" // Store c37 into quad and increment by rs_c. - " \n\t" - " mov x5, x20 \n\t" - " \n\t" - " st1 {v10.s}[0],[x5],x14 \n\t" // Store c40 into quad and increment by rs_c. - " st1 {v10.s}[1],[x5],x14 \n\t" // Store c41 into quad and increment by rs_c. - " st1 {v10.s}[2],[x5],x14 \n\t" // Store c42 into quad and increment by rs_c. - " st1 {v10.s}[3],[x5],x14 \n\t" // Store c43 into quad and increment by rs_c. - " st1 {v11.s}[0],[x5],x14 \n\t" // Store c44 into quad and increment by rs_c. - " st1 {v11.s}[1],[x5],x14 \n\t" // Store c45 into quad and increment by rs_c. - " st1 {v11.s}[2],[x5],x14 \n\t" // Store c46 into quad and increment by rs_c. - " st1 {v11.s}[3],[x5],x14 \n\t" // Store c47 into quad and increment by rs_c. - " \n\t" - " mov x5, x21 \n\t" - " \n\t" - " st1 {v12.s}[0],[x5],x14 \n\t" // Store c50 into quad and increment by rs_c. - " st1 {v12.s}[1],[x5],x14 \n\t" // Store c51 into quad and increment by rs_c. - " st1 {v12.s}[2],[x5],x14 \n\t" // Store c52 into quad and increment by rs_c. - " st1 {v12.s}[3],[x5],x14 \n\t" // Store c53 into quad and increment by rs_c. - " st1 {v13.s}[0],[x5],x14 \n\t" // Store c54 into quad and increment by rs_c. - " st1 {v13.s}[1],[x5],x14 \n\t" // Store c55 into quad and increment by rs_c. - " st1 {v13.s}[2],[x5],x14 \n\t" // Store c56 into quad and increment by rs_c. - " st1 {v13.s}[3],[x5],x14 \n\t" // Store c57 into quad and increment by rs_c. 
- " \n\t" - " dup v0.4s, wzr \n\t" - " dup v1.4s, wzr \n\t" - " dup v2.4s, wzr \n\t" - " dup v3.4s, wzr \n\t" - " dup v4.4s, wzr \n\t" - " dup v5.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x22 \n\t" - " \n\t" - " ld1 {v0.s}[0],[x5],x14 \n\t" // Load c60 into quad and increment by rs_c. - " ld1 {v0.s}[1],[x5],x14 \n\t" // Load c61 into quad and increment by rs_c. - " ld1 {v0.s}[2],[x5],x14 \n\t" // Load c62 into quad and increment by rs_c. - " ld1 {v0.s}[3],[x5],x14 \n\t" // Load c63 into quad and increment by rs_c. - " ld1 {v1.s}[0],[x5],x14 \n\t" // Load c64 into quad and increment by rs_c. - " ld1 {v1.s}[1],[x5],x14 \n\t" // Load c65 into quad and increment by rs_c. - " ld1 {v1.s}[2],[x5],x14 \n\t" // Load c66 into quad and increment by rs_c. - " ld1 {v1.s}[3],[x5],x14 \n\t" // Load c67 into quad and increment by rs_c. - " \n\t" - " mov x5, x23 \n\t" - " \n\t" - " ld1 {v2.s}[0],[x5],x14 \n\t" // Load c70 into quad and increment by rs_c. - " ld1 {v2.s}[1],[x5],x14 \n\t" // Load c71 into quad and increment by rs_c. - " ld1 {v2.s}[2],[x5],x14 \n\t" // Load c72 into quad and increment by rs_c. - " ld1 {v2.s}[3],[x5],x14 \n\t" // Load c73 into quad and increment by rs_c. - " ld1 {v3.s}[0],[x5],x14 \n\t" // Load c74 into quad and increment by rs_c. - " ld1 {v3.s}[1],[x5],x14 \n\t" // Load c75 into quad and increment by rs_c. - " ld1 {v3.s}[2],[x5],x14 \n\t" // Load c76 into quad and increment by rs_c. - " ld1 {v3.s}[3],[x5],x14 \n\t" // Load c77 into quad and increment by rs_c. - " \n\t" - " mov x5, x24 \n\t" - " \n\t" - " ld1 {v4.s}[0],[x5],x14 \n\t" // Load c80 into quad and increment by rs_c. - " ld1 {v4.s}[1],[x5],x14 \n\t" // Load c81 into quad and increment by rs_c. - " ld1 {v4.s}[2],[x5],x14 \n\t" // Load c82 into quad and increment by rs_c. - " ld1 {v4.s}[3],[x5],x14 \n\t" // Load c83 into quad and increment by rs_c. 
- " ld1 {v5.s}[0],[x5],x14 \n\t" // Load c84 into quad and increment by rs_c. - " ld1 {v5.s}[1],[x5],x14 \n\t" // Load c85 into quad and increment by rs_c. - " ld1 {v5.s}[2],[x5],x14 \n\t" // Load c86 into quad and increment by rs_c. - " ld1 {v5.s}[3],[x5],x14 \n\t" // Load c87 into quad and increment by rs_c. - " \n\t" - " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta - " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta - " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta - " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta - " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta - " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS3) - " \n\t" - " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x22 \n\t" - " \n\t" - " st1 {v0.s}[0],[x5],x14 \n\t" // Store c60 into quad and increment by rs_c. - " st1 {v0.s}[1],[x5],x14 \n\t" // Store c61 into quad and increment by rs_c. - " st1 {v0.s}[2],[x5],x14 \n\t" // Store c62 into quad and increment by rs_c. - " st1 {v0.s}[3],[x5],x14 \n\t" // Store c63 into quad and increment by rs_c. - " st1 {v1.s}[0],[x5],x14 \n\t" // Store c64 into quad and increment by rs_c. - " st1 {v1.s}[1],[x5],x14 \n\t" // Store c65 into quad and increment by rs_c. - " st1 {v1.s}[2],[x5],x14 \n\t" // Store c66 into quad and increment by rs_c. - " st1 {v1.s}[3],[x5],x14 \n\t" // Store c67 into quad and increment by rs_c. - " \n\t" - " mov x5, x23 \n\t" - " \n\t" - " st1 {v2.s}[0],[x5],x14 \n\t" // Store c70 into quad and increment by rs_c. - " st1 {v2.s}[1],[x5],x14 \n\t" // Store c71 into quad and increment by rs_c. - " st1 {v2.s}[2],[x5],x14 \n\t" // Store c72 into quad and increment by rs_c. 
- " st1 {v2.s}[3],[x5],x14 \n\t" // Store c73 into quad and increment by rs_c. - " st1 {v3.s}[0],[x5],x14 \n\t" // Store c74 into quad and increment by rs_c. - " st1 {v3.s}[1],[x5],x14 \n\t" // Store c75 into quad and increment by rs_c. - " st1 {v3.s}[2],[x5],x14 \n\t" // Store c76 into quad and increment by rs_c. - " st1 {v3.s}[3],[x5],x14 \n\t" // Store c77 into quad and increment by rs_c. - " \n\t" - " mov x5, x24 \n\t" - " \n\t" - " st1 {v4.s}[0],[x5],x14 \n\t" // Store c80 into quad and increment by rs_c. - " st1 {v4.s}[1],[x5],x14 \n\t" // Store c81 into quad and increment by rs_c. - " st1 {v4.s}[2],[x5],x14 \n\t" // Store c82 into quad and increment by rs_c. - " st1 {v4.s}[3],[x5],x14 \n\t" // Store c83 into quad and increment by rs_c. - " st1 {v5.s}[0],[x5],x14 \n\t" // Store c84 into quad and increment by rs_c. - " st1 {v5.s}[1],[x5],x14 \n\t" // Store c85 into quad and increment by rs_c. - " st1 {v5.s}[2],[x5],x14 \n\t" // Store c86 into quad and increment by rs_c. - " st1 {v5.s}[3],[x5],x14 \n\t" // Store c87 into quad and increment by rs_c. - " \n\t" - " dup v8.4s, wzr \n\t" - " dup v9.4s, wzr \n\t" - " dup v10.4s, wzr \n\t" - " dup v11.4s, wzr \n\t" - " dup v12.4s, wzr \n\t" - " dup v13.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x25 \n\t" - " \n\t" - " ld1 {v8.s}[0],[x5],x14 \n\t" // Load c90 into quad and increment by rs_c. - " ld1 {v8.s}[1],[x5],x14 \n\t" // Load c91 into quad and increment by rs_c. - " ld1 {v8.s}[2],[x5],x14 \n\t" // Load c92 into quad and increment by rs_c. - " ld1 {v8.s}[3],[x5],x14 \n\t" // Load c93 into quad and increment by rs_c. - " ld1 {v9.s}[0],[x5],x14 \n\t" // Load c94 into quad and increment by rs_c. - " ld1 {v9.s}[1],[x5],x14 \n\t" // Load c95 into quad and increment by rs_c. - " ld1 {v9.s}[2],[x5],x14 \n\t" // Load c96 into quad and increment by rs_c. - " ld1 {v9.s}[3],[x5],x14 \n\t" // Load c97 into quad and increment by rs_c. 
- " \n\t" - " mov x5, x26 \n\t" - " \n\t" - " ld1 {v10.s}[0],[x5],x14 \n\t" // Load c100 into quad and increment by rs_c. - " ld1 {v10.s}[1],[x5],x14 \n\t" // Load c101 into quad and increment by rs_c. - " ld1 {v10.s}[2],[x5],x14 \n\t" // Load c102 into quad and increment by rs_c. - " ld1 {v10.s}[3],[x5],x14 \n\t" // Load c103 into quad and increment by rs_c. - " ld1 {v11.s}[0],[x5],x14 \n\t" // Load c104 into quad and increment by rs_c. - " ld1 {v11.s}[1],[x5],x14 \n\t" // Load c105 into quad and increment by rs_c. - " ld1 {v11.s}[2],[x5],x14 \n\t" // Load c106 into quad and increment by rs_c. - " ld1 {v11.s}[3],[x5],x14 \n\t" // Load c107 into quad and increment by rs_c. - " \n\t" - " mov x5, x27 \n\t" - " \n\t" - " ld1 {v12.s}[0],[x5],x14 \n\t" // Load c110 into quad and increment by rs_c. - " ld1 {v12.s}[1],[x5],x14 \n\t" // Load c111 into quad and increment by rs_c. - " ld1 {v12.s}[2],[x5],x14 \n\t" // Load c112 into quad and increment by rs_c. - " ld1 {v12.s}[3],[x5],x14 \n\t" // Load c113 into quad and increment by rs_c. - " ld1 {v13.s}[0],[x5],x14 \n\t" // Load c114 into quad and increment by rs_c. - " ld1 {v13.s}[1],[x5],x14 \n\t" // Load c115 into quad and increment by rs_c. - " ld1 {v13.s}[2],[x5],x14 \n\t" // Load c116 into quad and increment by rs_c. - " ld1 {v13.s}[3],[x5],x14 \n\t" // Load c117 into quad and increment by rs_c. 
- " \n\t" - " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta - " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta - " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta - " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta - " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta - " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS4) - " \n\t" - " prfm pldl2keep,[x0] \n\t" - " prfm pldl2keep,[x1] \n\t" - " \n\t" - " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x25 \n\t" - " \n\t" - " st1 {v8.s}[0],[x5],x14 \n\t" // Store c90 into quad and increment by rs_c. - " st1 {v8.s}[1],[x5],x14 \n\t" // Store c91 into quad and increment by rs_c. - " st1 {v8.s}[2],[x5],x14 \n\t" // Store c92 into quad and increment by rs_c. - " st1 {v8.s}[3],[x5],x14 \n\t" // Store c93 into quad and increment by rs_c. - " st1 {v9.s}[0],[x5],x14 \n\t" // Store c94 into quad and increment by rs_c. - " st1 {v9.s}[1],[x5],x14 \n\t" // Store c95 into quad and increment by rs_c. - " st1 {v9.s}[2],[x5],x14 \n\t" // Store c96 into quad and increment by rs_c. - " st1 {v9.s}[3],[x5],x14 \n\t" // Store c97 into quad and increment by rs_c. - " \n\t" - " mov x5, x26 \n\t" - " \n\t" - " st1 {v10.s}[0],[x5],x14 \n\t" // Store c100 into quad and increment by rs_c. - " st1 {v10.s}[1],[x5],x14 \n\t" // Store c101 into quad and increment by rs_c. - " st1 {v10.s}[2],[x5],x14 \n\t" // Store c102 into quad and increment by rs_c. - " st1 {v10.s}[3],[x5],x14 \n\t" // Store c103 into quad and increment by rs_c. - " st1 {v11.s}[0],[x5],x14 \n\t" // Store c104 into quad and increment by rs_c. - " st1 {v11.s}[1],[x5],x14 \n\t" // Store c105 into quad and increment by rs_c. 
- " st1 {v11.s}[2],[x5],x14 \n\t" // Store c106 into quad and increment by rs_c. - " st1 {v11.s}[3],[x5],x14 \n\t" // Store c107 into quad and increment by rs_c. - " \n\t" - " mov x5, x27 \n\t" - " \n\t" - " st1 {v12.s}[0],[x5],x14 \n\t" // Store c110 into quad and increment by rs_c. - " st1 {v12.s}[1],[x5],x14 \n\t" // Store c111 into quad and increment by rs_c. - " st1 {v12.s}[2],[x5],x14 \n\t" // Store c112 into quad and increment by rs_c. - " st1 {v12.s}[3],[x5],x14 \n\t" // Store c113 into quad and increment by rs_c. - " st1 {v13.s}[0],[x5],x14 \n\t" // Store c114 into quad and increment by rs_c. - " st1 {v13.s}[1],[x5],x14 \n\t" // Store c115 into quad and increment by rs_c. - " st1 {v13.s}[2],[x5],x14 \n\t" // Store c116 into quad and increment by rs_c. - " st1 {v13.s}[3],[x5],x14 \n\t" // Store c147 into quad and increment by rs_c. - " \n\t" - LABEL(SEND) // Done! + // BRANCH(SEND) // Done. + // LABEL(SEND) // Done! " \n\t" :// output operands (none) :// input operands @@ -1072,7 +693,7 @@ void bli_sgemm_armv8a_asm_8x12 [b_next] "m" (b_next) // 10 :// Register clobber list "x0", "x1", "x2", - "x5", "x6", "x10","x14", + "x5", "x6", "x10", "x16","x17","x19","x20", "x21","x22","x23","x24", "x25","x26","x27", @@ -1148,8 +769,8 @@ void bli_dgemm_armv8a_asm_6x8 " ldr x10,%[cs_c] \n\t" // Load cs_c " lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) " \n\t" - " ldr x14,%[rs_c] \n\t" // Load rs_c. - " lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). + // " ldr x14,%[rs_c] \n\t" // Load rs_c. + // " lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). " \n\t" " add x20,x2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C @@ -1620,9 +1241,6 @@ void bli_dgemm_armv8a_asm_6x8 " ldr x0,%[a_next] \n\t" // Next A address for later use. " ldr x1,%[b_next] \n\t" // Next B address for later use. " \n\t" - " cmp x14,#8 \n\t" // If rs_c != 1 (column-major) - BNE(DGENSTORED) - " \n\t" LABEL(DCOLSTORED) // C is column-major. 
" \n\t" " dup v0.2d, xzr \n\t" @@ -1796,262 +1414,8 @@ void bli_dgemm_armv8a_asm_6x8 " str q12, [x26, #16] \n\t" " str q13, [x26, #32] \n\t" " \n\t" - BRANCH(DEND) - " \n\t" - LABEL(DGENSTORED) // C is general-stride stored. - " \n\t" - " dup v0.2d, xzr \n\t" - " dup v1.2d, xzr \n\t" - " dup v2.2d, xzr \n\t" - " dup v3.2d, xzr \n\t" - " dup v4.2d, xzr \n\t" - " dup v5.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x2 \n\t" - " \n\t" // Load address of C. - " ld1 {v0.d}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. - " ld1 {v0.d}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. - " ld1 {v1.d}[0],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. - " ld1 {v1.d}[1],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. - " ld1 {v2.d}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. - " ld1 {v2.d}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. - " \n\t" - " mov x27, x20 \n\t" // Load address of C. - " \n\t" - " ld1 {v3.d}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. - " ld1 {v3.d}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. - " ld1 {v4.d}[0],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. - " ld1 {v4.d}[1],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. - " ld1 {v5.d}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. - " ld1 {v5.d}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. 
- " \n\t" - " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta - " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta - " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta - " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta - " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta - " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS1) - " \n\t" - " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x2 \n\t" // Load address of C. - " \n\t" - " st1 {v0.d}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. - " st1 {v0.d}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. - " st1 {v1.d}[0],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. - " st1 {v1.d}[1],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. - " st1 {v2.d}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. - " st1 {v2.d}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. - " \n\t" - " mov x27, x20 \n\t" // Load address of C. - " \n\t" - " st1 {v3.d}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. - " st1 {v3.d}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. - " st1 {v4.d}[0],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. - " st1 {v4.d}[1],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. - " st1 {v5.d}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. - " st1 {v5.d}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. 
- " \n\t" - " dup v8.2d, xzr \n\t" - " dup v9.2d, xzr \n\t" - " dup v10.2d, xzr \n\t" - " dup v11.2d, xzr \n\t" - " dup v12.2d, xzr \n\t" - " dup v13.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x21 \n\t" // Load address of C. - " \n\t" - " ld1 {v8.d}[0], [x27],x14 \n\t" // Load c20 into quad and increment by rs_c. - " ld1 {v8.d}[1], [x27],x14 \n\t" // Load c21 into quad and increment by rs_c. - " ld1 {v9.d}[0], [x27],x14 \n\t" // Load c22 into quad and increment by rs_c. - " ld1 {v9.d}[1], [x27],x14 \n\t" // Load c23 into quad and increment by rs_c. - " ld1 {v10.d}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. - " ld1 {v10.d}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. - " \n\t" - " mov x27, x22 \n\t" // Load address of C. - " \n\t" - " ld1 {v11.d}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. - " ld1 {v11.d}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. - " ld1 {v12.d}[0],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. - " ld1 {v12.d}[1],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. - " ld1 {v13.d}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. - " ld1 {v13.d}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. 
- " \n\t" - " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta - " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta - " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta - " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta - " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta - " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS2) - " \n\t" - " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x21 \n\t" // Load address of C. - " \n\t" - " st1 {v8.d}[0], [x27],x14 \n\t" // Store c20 into quad and increment by rs_c. - " st1 {v8.d}[1], [x27],x14 \n\t" // Store c21 into quad and increment by rs_c. - " st1 {v9.d}[0], [x27],x14 \n\t" // Store c22 into quad and increment by rs_c. - " st1 {v9.d}[1], [x27],x14 \n\t" // Store c23 into quad and increment by rs_c. - " st1 {v10.d}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. - " st1 {v10.d}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. - " \n\t" - " mov x27, x22 \n\t" // Load address of C. - " \n\t" - " st1 {v11.d}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. - " st1 {v11.d}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. - " st1 {v12.d}[0],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. - " st1 {v12.d}[1],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. - " st1 {v13.d}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. - " st1 {v13.d}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. 
- " \n\t" - " dup v0.2d, xzr \n\t" - " dup v1.2d, xzr \n\t" - " dup v2.2d, xzr \n\t" - " dup v3.2d, xzr \n\t" - " dup v4.2d, xzr \n\t" - " dup v5.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x23 \n\t" // Load address of C. - " \n\t" - " ld1 {v0.d}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. - " ld1 {v0.d}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. - " ld1 {v1.d}[0],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. - " ld1 {v1.d}[1],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. - " ld1 {v2.d}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. - " ld1 {v2.d}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. - " \n\t" - " mov x27, x24 \n\t" // Load address of C. - " \n\t" - " ld1 {v3.d}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. - " ld1 {v3.d}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. - " ld1 {v4.d}[0],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. - " ld1 {v4.d}[1],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. - " ld1 {v5.d}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. - " ld1 {v5.d}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. 
- " \n\t" - " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta - " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta - " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta - " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta - " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta - " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS3) - " \n\t" - " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x23 \n\t" // Load address of C. - " \n\t" - " st1 {v0.d}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. - " st1 {v0.d}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. - " st1 {v1.d}[0],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. - " st1 {v1.d}[1],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. - " st1 {v2.d}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. - " st1 {v2.d}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. - " \n\t" - " mov x27, x24 \n\t" // Load address of C. - " \n\t" - " st1 {v3.d}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. - " st1 {v3.d}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. - " st1 {v4.d}[0],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. - " st1 {v4.d}[1],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. - " st1 {v5.d}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. - " st1 {v5.d}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. 
- " \n\t" - " dup v8.2d, xzr \n\t" - " dup v9.2d, xzr \n\t" - " dup v10.2d, xzr \n\t" - " dup v11.2d, xzr \n\t" - " dup v12.2d, xzr \n\t" - " dup v13.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x25 \n\t" - " \n\t" - " ld1 {v8.d}[0], [x27],x14 \n\t" // Load c60 into quad and increment by rs_c. - " ld1 {v8.d}[1], [x27],x14 \n\t" // Load c61 into quad and increment by rs_c. - " ld1 {v9.d}[0], [x27],x14 \n\t" // Load c62 into quad and increment by rs_c. - " ld1 {v9.d}[1], [x27],x14 \n\t" // Load c63 into quad and increment by rs_c. - " ld1 {v10.d}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. - " ld1 {v10.d}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. - " \n\t" - " mov x27, x26 \n\t" // Load address of C. - " \n\t" - " ld1 {v11.d}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. - " ld1 {v11.d}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. - " ld1 {v12.d}[0],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. - " ld1 {v12.d}[1],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. - " ld1 {v13.d}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. - " ld1 {v13.d}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. 
- " \n\t" - " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta - " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta - " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta - " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta - " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta - " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS4) - " \n\t" - " prfm pldl2keep,[x0] \n\t" - " prfm pldl2keep,[x1] \n\t" - " \n\t" - " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x25 \n\t" // Load address of C. - " \n\t" - " st1 {v8.d}[0], [x27],x14 \n\t" // Store c60 into quad and increment by rs_c. - " st1 {v8.d}[1], [x27],x14 \n\t" // Store c61 into quad and increment by rs_c. - " st1 {v9.d}[0], [x27],x14 \n\t" // Store c62 into quad and increment by rs_c. - " st1 {v9.d}[1], [x27],x14 \n\t" // Store c63 into quad and increment by rs_c. - " st1 {v10.d}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. - " st1 {v10.d}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. - " \n\t" - " mov x27, x26 \n\t" // Load address of C. - " \n\t" - " st1 {v11.d}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. - " st1 {v11.d}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. - " st1 {v12.d}[0],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. - " st1 {v12.d}[1],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. - " st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. - " st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. - " \n\t" - LABEL(DEND) // Done! + // BRANCH(DEND) + // LABEL(DEND) // Done! 
" \n\t" :// output operands (none) :// input operands @@ -2069,8 +1433,9 @@ void bli_dgemm_armv8a_asm_6x8 :// Register clobber list "x0","x1","x2", "x5","x6","x10", - "x14","x16","x17", - "x20","x21","x22","x23","x24","x25","x26","x27", + "x16","x17","x20", + "x21","x22","x23", + "x24","x25","x26","x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", From 9cc897f37455d52fbba752e3801f1a9d4a5bfdc1 Mon Sep 17 00:00:00 2001 From: Ruqing Xu Date: Thu, 3 Feb 2022 16:40:02 +0000 Subject: [PATCH 025/230] Fix SVE Compil. --- config/a64fx/bli_family_a64fx.h | 10 ++++++++++ .../armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 2 +- .../armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 2 +- .../armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 2 +- .../armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 2 +- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/config/a64fx/bli_family_a64fx.h b/config/a64fx/bli_family_a64fx.h index 5e3f29fd4..b67ae7c60 100644 --- a/config/a64fx/bli_family_a64fx.h +++ b/config/a64fx/bli_family_a64fx.h @@ -41,6 +41,16 @@ #define BLIS_SIMD_ALIGN_SIZE 256 #define BLIS_SIMD_NUM_REGISTERS 32 +// SVE-specific configs. 
+#define N_L1_SVE_DEFAULT 64 +#define W_L1_SVE_DEFAULT 4 +#define C_L1_SVE_DEFAULT 256 +#define N_L2_SVE_DEFAULT 2048 +#define W_L2_SVE_DEFAULT 16 +#define C_L2_SVE_DEFAULT 256 +#define N_L3_SVE_DEFAULT 8192 +#define W_L3_SVE_DEFAULT 16 +#define C_L3_SVE_DEFAULT 256 //#endif diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 60a64515f..0327f6dbc 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 7136104b5..e92eba9d6 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 20841891b..deb01f9fe 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" 
CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 7e630894f..e941f5abd 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" From 72089bb2917b78d99cf4f27c69125bf213ee54e6 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 5 Feb 2022 16:56:04 +0900 Subject: [PATCH 026/230] ArmSVE Use Predicate in M-Direction No need to query MR during kernel runtime. --- .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 7 ++- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 53 +++++++------------ .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 50 +++++++---------- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 7 ++- 4 files changed, 43 insertions(+), 74 deletions(-) diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 0327f6dbc..c24384b02 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -68,10 +68,10 @@ void bli_cgemm_armsve_asm_2vx10_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 8; - GEMM_UKR_SETUP_CT( c, mr, 10, false ); + GEMM_UKR_SETUP_CT( c, m, 10, false ); __asm__ volatile ( +" whilelo p0.s, xzr, %12 \n\t" // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -97,7 +97,6 @@ void bli_cgemm_armsve_asm_2vx10_unindexed " madd x2, x16, x2, xzr \n\t" // cs_a " madd x3, x16, x3, xzr \n\t" // rs_b " madd %4, x16, 
%4, xzr \n\t" // cs_c -" ptrue p0.s \n\t" " \n\t" // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" @@ -307,7 +306,7 @@ GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) "+r" (a_next), // %9 "+r" (b_next), // %10 "=r" (info) // %11 -: +: "r" (m) // %12 : "x2","x3","x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index e92eba9d6..1c2c37208 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -67,10 +67,14 @@ void bli_dgemm_armsve_asm_2vx10_unindexed uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 8; - GEMM_UKR_SETUP_CT( d, mr, 10, false ); + GEMM_UKR_SETUP_CT( d, m, 10, false ); __asm__ volatile ( +" mov x0, xzr \n\t" +" ldr x1, %[m] \n\t" +" whilelo p0.d, x0, x1 \n\t" " incd x0 \n\t" +" whilelo p1.d, x0, x1 \n\t" +" \n\t" " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -96,7 +100,6 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c -" ptrue p0.d \n\t" " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" @@ -114,7 +117,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " ld1rd z26.d, p0/z, [x1, 48] \n\t" " ld1rd z27.d, p0/z, [x1, 56] \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" " CCOL_PRFM: \n\t" // " cmp x6, #1 \n\t" @@ -149,22 +152,22 @@ CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " K_MKER_LOOP: \n\t" " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
-GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. " b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " b K_MKER_LOOP \n\t" " \n\t" @@ -176,7 +179,7 @@ GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3 " cmp x8, #0 \n\t" // End of execution. " b.eq WRITE_MEM_PREP \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rd z21.d, p0/z, [x1, 8] \n\t" " ld1rd z22.d, p0/z, [x1, 16] \n\t" @@ -255,7 +258,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" // Preload first half of C for contiguous case. 
// " b.ne WRITE_MEM \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) " \n\t" " WRITE_MEM: \n\t" " \n\t" @@ -273,35 +276,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. " b.eq BETA_ZERO_C \n\t" // First half of C is already loaded in this case. -// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) +// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_C: \n\t" -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) -GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) +GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) +GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) // " b END_WRITE_MEM \n\t" // " \n\t" -// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. -// " mov x8, xzr \n\t" -// " incb x8 \n\t" -// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. -// " index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -// " \n\t" -// " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. 
-// " b.eq BETA_ZERO_G \n\t" -// " \n\t" -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// " \n\t" -// " BETA_ZERO_G: \n\t" -// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -// " \n\t" // " END_WRITE_MEM: \n\t" // " b END_EXEC \n\t" // " \n\t" @@ -310,7 +294,8 @@ GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. : -: [a] "m" (a), +: [m] "m" (m), + [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index deb01f9fe..7dad6953f 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -67,10 +67,14 @@ void bli_sgemm_armsve_asm_2vx10_unindexed uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 4; - GEMM_UKR_SETUP_CT( s, mr, 10, false ); + GEMM_UKR_SETUP_CT( s, m, 10, false ); __asm__ volatile ( +" mov x0, xzr \n\t" +" ldr x1, %[m] \n\t" +" whilelo p0.s, x0, x1 \n\t" " incw x0 \n\t" +" whilelo p1.s, x0, x1 \n\t" +" \n\t" " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -96,7 +100,6 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c -" ptrue p0.s \n\t" " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. 
" ldr x8, %[k_left] \n\t" @@ -114,7 +117,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " ld1rw z26.s, p0/z, [x1, 24] \n\t" " ld1rw z27.s, p0/z, [x1, 28] \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" " CCOL_PRFM: \n\t" // " cmp x6, #1 \n\t" @@ -149,22 +152,22 @@ CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " K_MKER_LOOP: \n\t" " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. " b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " b K_MKER_LOOP \n\t" " \n\t" @@ -176,7 +179,7 @@ GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3 " cmp x8, #0 \n\t" // End of execution. " b.eq WRITE_MEM_PREP \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rw z21.s, p0/z, [x1, 4] \n\t" " ld1rw z22.s, p0/z, [x1, 8] \n\t" @@ -260,34 +263,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " \n\t" // Here used scratch: Z[20-29]. " fcmp s31, #0.0 \n\t" " b.eq BETA_ZERO_C \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_C: \n\t" -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) -GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) +GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) +GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) // " b END_WRITE_MEM \n\t" // " \n\t" -// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. -// " mov x8, xzr \n\t" -// " incb x8 \n\t" -// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. 
-// " index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. -// " \n\t" -// " fcmp s31, #0.0 \n\t" -// " b.eq BETA_ZERO_G \n\t" -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// " \n\t" -// " BETA_ZERO_G: \n\t" -// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -// " \n\t" // " END_WRITE_MEM: \n\t" // " b END_EXEC \n\t" // " \n\t" @@ -296,7 +281,8 @@ GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. : -: [a] "m" (a), +: [m] "m" (m), + [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index e941f5abd..42b1345ff 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -68,10 +68,10 @@ void bli_zgemm_armsve_asm_2vx10_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 16; - GEMM_UKR_SETUP_CT( z, mr, 10, false ); + GEMM_UKR_SETUP_CT( z, m, 10, false ); __asm__ volatile ( +" whilelo p0.d, xzr, %12 \n\t" // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -97,7 +97,6 @@ void bli_zgemm_armsve_asm_2vx10_unindexed " madd x2, x16, x2, xzr \n\t" // cs_a " madd x3, x16, x3, xzr \n\t" // rs_b " madd %4, x16, %4, xzr \n\t" // cs_c -" ptrue p0.d \n\t" " \n\t" // " ldr x5, %[k_mker] \n\t" // Number of loops. 
// " ldr x6, %[k_left] \n\t" @@ -306,7 +305,7 @@ GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) "+r" (a_next), // %9 "+r" (b_next), // %10 "=r" (info) // %11 -: +: "r" (m) // %12 : "x2","x3","x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", From 2f3872e01d51545c687ae2c8b2650e00552111a7 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 7 Feb 2022 17:14:49 +0900 Subject: [PATCH 027/230] ArmSVE Adopts Label Wrapper For clang (& armclang?) compilation. Hopefully solves #609 . --- .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c | 37 +++++----- .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c | 37 +++++----- kernels/armsve/3/armsve_asm_macros.h | 13 ++++ .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 70 +++++++++---------- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 57 +++++++-------- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 55 ++++++++------- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 70 +++++++++---------- kernels/armv8a/3/armv8a_asm_utils.h | 2 +- 8 files changed, 179 insertions(+), 162 deletions(-) diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c index 44718fa57..a086b3a76 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c @@ -36,6 +36,7 @@ #include "blis.h" #include "armsve512_asm_transpose_d8x8.h" #include "armsve512_asm_transpose_d8x2.h" +#include "../3/armsve_asm_macros.h" // assumption: // SVE vector length = 512 bits. @@ -93,9 +94,9 @@ void bli_dpackm_armsve512_asm_10xk "mov x8, %[n_mker] \n\t" "mov x9, %[n_left] \n\t" "ptrue p0.d \n\t" - "b.ne .AROWSTOR \n\t" + BNE(AROWSTOR) // A stored in columns. - " .ACOLSTOR: \n\t" + LABEL(ACOLSTOR) // Prefetch distance. 
"mov x17, #8 \n\t" "madd x17, x17, x3, xzr \n\t" @@ -105,9 +106,9 @@ void bli_dpackm_armsve512_asm_10xk "lsl x16, x16, #60 \n\t" "orr x0, x0, x16 \n\t" #endif - " .ACOLSTORMKER: \n\t" + LABEL(ACOLSTORMKER) "cmp x8, xzr \n\t" - "b.eq .ACOLSTORMKEREND \n\t" + BEQ(ACOLSTORMKEREND) "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" @@ -201,11 +202,11 @@ void bli_dpackm_armsve512_asm_10xk // "add x1, x1, #320 \n\t" "add x0, x7, x3 \n\t" "sub x8, x8, #1 \n\t" - "b .ACOLSTORMKER \n\t" - " .ACOLSTORMKEREND: \n\t" - " .ACOLSTORLEFT: \n\t" + BRANCH(ACOLSTORMKER) + LABEL(ACOLSTORMKEREND) + LABEL(ACOLSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "ld1d z0.d, p0/z, [x0] \n\t" "ldr q1, [x0, #64] \n\t" "st1d z0.d, p0, [x1] \n\t" @@ -213,14 +214,14 @@ void bli_dpackm_armsve512_asm_10xk "add x0, x0, x3 \n\t" "add x1, x1, x2 \n\t" "sub x9, x9, #1 \n\t" - "b .ACOLSTORLEFT \n\t" + BRANCH(ACOLSTORLEFT) // A stored in rows. - " .AROWSTOR: \n\t" + LABEL(AROWSTOR) // Prepare predicates for in-reg transpose. SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) - " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful. + LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful. "cmp x8, xzr \n\t" - "b.eq .AROWSTORMKEREND \n\t" + BEQ(AROWSTORMKEREND) "add x10, x0, x4 \n\t" "add x11, x10, x4 \n\t" "add x12, x11, x4 \n\t" @@ -271,15 +272,15 @@ void bli_dpackm_armsve512_asm_10xk "add x1, x16, x2 \n\t" "add x0, x0, #64 \n\t" "sub x8, x8, #1 \n\t" - "b .AROWSTORMKER \n\t" - " .AROWSTORMKEREND: \n\t" + BRANCH(AROWSTORMKER) + LABEL(AROWSTORMKEREND) "mov x4, %[inca] \n\t" // Restore unshifted inca. "index z30.d, xzr, x4 \n\t" // Generate index. "lsl x4, x4, #3 \n\t" // Shift again. "lsl x5, x4, #3 \n\t" // Virtual column vl. 
- " .AROWSTORLEFT: \n\t" + LABEL(AROWSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "add x6, x0, x5 \n\t" "add x7, x6, x4 \n\t" "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" @@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_10xk "add x1, x1, x2 \n\t" "add x0, x0, #8 \n\t" "sub x9, x9, #1 \n\t" - "b .AROWSTORLEFT \n\t" - " .UNITKDONE: \n\t" + BRANCH(AROWSTORLEFT) + LABEL(UNITKDONE) "mov x0, #0 \n\t" : : [a] "r" (a), diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c index f02b87a7a..aeb323c0c 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c @@ -35,6 +35,7 @@ #include "blis.h" #include "armsve512_asm_transpose_d8x8.h" +#include "../3/armsve_asm_macros.h" // assumption: // SVE vector length = 512 bits. @@ -99,9 +100,9 @@ void bli_dpackm_armsve512_asm_16xk "mov x8, %[n_mker] \n\t" "mov x9, %[n_left] \n\t" "ptrue p0.d \n\t" - "b.ne .AROWSTOR \n\t" + BNE(AROWSTOR) // A stored in columns. - " .ACOLSTOR: \n\t" + LABEL(ACOLSTOR) // Prefetch distance. 
"mov x17, #8 \n\t" "madd x17, x17, x3, xzr \n\t" @@ -125,9 +126,9 @@ void bli_dpackm_armsve512_asm_16xk // "prfm PLDL1STRM, [x5] \n\t" // "prfm PLDL1STRM, [x6] \n\t" // "prfm PLDL1STRM, [x7] \n\t" - " .ACOLSTORMKER: \n\t" + LABEL(ACOLSTORMKER) "cmp x8, xzr \n\t" - "b.eq .ACOLSTORMKEREND \n\t" + BEQ(ACOLSTORMKEREND) "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" @@ -193,11 +194,11 @@ void bli_dpackm_armsve512_asm_16xk "add x0, x7, x3 \n\t" "add x1, x16, x2 \n\t" "sub x8, x8, #1 \n\t" - "b .ACOLSTORMKER \n\t" - " .ACOLSTORMKEREND: \n\t" - " .ACOLSTORLEFT: \n\t" + BRANCH(ACOLSTORMKER) + LABEL(ACOLSTORMKEREND) + LABEL(ACOLSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "ld1d z0.d, p0/z, [x0] \n\t" "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t" "st1d z0.d, p0, [x1] \n\t" @@ -205,14 +206,14 @@ void bli_dpackm_armsve512_asm_16xk "add x0, x0, x3 \n\t" "add x1, x1, x2 \n\t" "sub x9, x9, #1 \n\t" - "b .ACOLSTORLEFT \n\t" + BRANCH(ACOLSTORLEFT) // A stored in rows. - " .AROWSTOR: \n\t" + LABEL(AROWSTOR) // Prepare predicates for in-reg transpose. SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) - " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful. + LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful. "cmp x8, xzr \n\t" - "b.eq .AROWSTORMKEREND \n\t" + BEQ(AROWSTORMKEREND) "add x10, x0, x4 \n\t" "add x11, x10, x4 \n\t" "add x12, x11, x4 \n\t" @@ -274,15 +275,15 @@ void bli_dpackm_armsve512_asm_16xk "add x0, x0, #64 \n\t" "add x1, x16, x2 \n\t" "sub x8, x8, #1 \n\t" - "b .AROWSTORMKER \n\t" - " .AROWSTORMKEREND: \n\t" + BRANCH(AROWSTORMKER) + LABEL(AROWSTORMKEREND) "mov x4, %[inca] \n\t" // Restore unshifted inca. "index z30.d, xzr, x4 \n\t" // Generate index. "lsl x4, x4, #3 \n\t" // Shift again. "lsl x5, x4, #3 \n\t" // Virtual column vl. 
- " .AROWSTORLEFT: \n\t" + LABEL(AROWSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "add x6, x0, x5 \n\t" "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" "ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t" @@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_16xk "add x1, x1, x2 \n\t" "add x0, x0, #8 \n\t" "sub x9, x9, #1 \n\t" - "b .AROWSTORLEFT \n\t" - " .UNITKDONE: \n\t" + BRANCH(AROWSTORLEFT) + LABEL(UNITKDONE) "mov x0, #0 \n\t" : : [a] "r" (a), diff --git a/kernels/armsve/3/armsve_asm_macros.h b/kernels/armsve/3/armsve_asm_macros.h index 5e8eb3c62..9cbbeab92 100644 --- a/kernels/armsve/3/armsve_asm_macros.h +++ b/kernels/armsve/3/armsve_asm_macros.h @@ -33,6 +33,19 @@ */ +// Clang's label requirements. +#if defined(__clang__) +#define LABEL(str) " L" #str"%=: \n\t" +#define BEQ(str) "b.eq L" #str"%= \n\t" +#define BNE(str) "b.ne L" #str"%= \n\t" +#define BRANCH(str) "b L" #str"%= \n\t" +#else +#define LABEL(str) " ." #str": \n\t" +#define BEQ(str) "b.eq ." #str" \n\t" +#define BNE(str) "b.ne ." #str" \n\t" +#define BRANCH(str) "b ." #str" \n\t" +#endif + #define CLEAR_COL2(Z0,Z1) \ " dup "#Z0"."DT", #0 \n\t" \ " dup "#Z1"."DT", #0 \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index c24384b02..098d5d4b5 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -101,9 +101,9 @@ void bli_cgemm_armsve_asm_2vx10_unindexed // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp %5, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " \n\t" " ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. 
" ld1rw z21.s, p0/z, [%1, 4*2] \n\t" @@ -116,9 +116,9 @@ void bli_cgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp %3, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -139,14 +139,14 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp %5, #0 \n\t" // If no 4-microkernel can be applied. -" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) @@ -158,18 +158,18 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" " subs %5, %5, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. 
" \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp %6, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. @@ -182,9 +182,9 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rw z27.s, p0/z, [%1, 4*14] \n\t" GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " sub %6, %6, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" // " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). // " ldr x8, %[beta] \n\t" @@ -193,7 +193,7 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " ld1rw z30.s, p0/z, [%8] \n\t" // Real(beta). " ld1rw z31.s, p0/z, [%8, 4] \n\t" // Imag(beta). " \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) // " ldr x9, %[a_next] \n\t" // " ldr x10, %[b_next] \n\t" #ifdef _A64FX @@ -209,90 +209,90 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " prfm PLDL1STRM, [%10] \n\t" " prfm PLDL1STRM, [%10, 256*1] \n\t" " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " fmov s27, #1.0 \n\t" " fcmp s29, #0.0 \n\t" // Whether Imag(alpha) == 0. " fccmp s28, s27, 0, eq \n\t" // Whether Real(alpha) == 1. 
-" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) -" b WRITE_MEM_EXEC \n\t" +BRANCH(WRITE_MEM_EXEC) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " \n\t" -" WRITE_MEM_EXEC: \n\t" +LABEL(WRITE_MEM_EXEC) " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. // " cmp %3, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" +LABEL(WRITE_MEM_C) " fmov s29, wzr \n\t" " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. 
-" b.eq ZERO_BETA_C_0_1_2_3 \n\t" +BEQ(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_C_0_1_2_3: \n\t" +LABEL(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) " \n\t" -" b.eq ZERO_BETA_C_4_5_6_7_8_9 \n\t" +BEQ(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_C_4_5_6_7_8_9: \n\t" +LABEL(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " WRITE_MEM_G: \n\t" +// LABEL(WRITE_MEM_G) // " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, // " mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. // " index z28.s, wzr, w3 \n\t" // " fmov s29, wzr \n\t" // " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. // " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. 
-// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// BEQ(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -// " ZERO_BETA_G_0_1_2_3: \n\t" +// LABEL(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) // " \n\t" -// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// BEQ(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// LABEL(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) " \n\t" -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov %11, #0 \n\t" // Return normal. : "+r" (a), // %0 "+r" (b), // %1 diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 1c2c37208..0ee470f24 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -104,9 +104,9 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp x4, #0 \n\t" // Don't preload if no microkernel there. 
-" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rd z21.d, p0/z, [x1, 8] \n\t" @@ -119,9 +119,9 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp x6, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -142,14 +142,14 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x4, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) @@ -164,20 +164,20 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " add x0, x0, x2 \n\t" // Forward A to fill the blank. " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. @@ -203,9 +203,9 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x0, x0, x2 \n\t" // Forward A. " add x1, x1, x3 \n\t" // Forward B. " sub x8, x8, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" @@ -216,7 +216,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " fmov d28, #1.0 \n\t" // Prepare FP 1.0. " fmov x16, d28 \n\t" " \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" #ifdef _A64FX @@ -257,41 +257,42 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" // Preload first half of C for contiguous case. 
-// " b.ne WRITE_MEM \n\t" +// BNE(WRITE_MEM) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " \n\t" " cmp x16, x4 \n\t" -" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) // " cmp x6, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. +LABEL(WRITE_MEM_C) +" \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. -" b.eq BETA_ZERO_C \n\t" +BEQ(BETA_ZERO_C) // First half of C is already loaded in this case. // GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" -" BETA_ZERO_C: \n\t" +LABEL(BETA_ZERO_C) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) // " \n\t" -// " END_ERROR: \n\t" +// LABEL(END_ERROR) // " mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov x0, #0 \n\t" // Return normal. 
: : [m] "m" (m), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 7dad6953f..d03af5923 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -104,9 +104,9 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp x4, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rw z21.s, p0/z, [x1, 4] \n\t" @@ -119,9 +119,9 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp x6, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -142,14 +142,14 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x4, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) @@ -164,20 +164,20 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. 
-" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " add x0, x0, x2 \n\t" // Forward A to fill the blank. " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. @@ -203,9 +203,9 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x0, x0, x2 \n\t" // Forward A. " add x1, x1, x3 \n\t" // Forward B. " sub x8, x8, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" @@ -214,7 +214,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors. 
" dup z31.s, w8 \n\t" " \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" " prfm PLDL2KEEP, [x0] \n\t" @@ -244,41 +244,42 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " prfm PLDL2KEEP, [x1, 256*8] \n\t" " prfm PLDL2KEEP, [x1, 256*9] \n\t" " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " \n\t" " fmov s28, #1.0 \n\t" " fmov w16, s28 \n\t" " cmp w16, w4 \n\t" -" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. +LABEL(WRITE_MEM_C) +" \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. " fcmp s31, #0.0 \n\t" -" b.eq BETA_ZERO_C \n\t" +BEQ(BETA_ZERO_C) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" -" BETA_ZERO_C: \n\t" +LABEL(BETA_ZERO_C) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) // " \n\t" -// " END_ERROR: \n\t" +// LABEL(END_ERROR) // " mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov x0, #0 \n\t" // Return normal. 
: : [m] "m" (m), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 42b1345ff..8636a527b 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -101,9 +101,9 @@ void bli_zgemm_armsve_asm_2vx10_unindexed // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp %5, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " \n\t" " ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. " ld1rd z21.d, p0/z, [%1, 8*2] \n\t" @@ -116,9 +116,9 @@ void bli_zgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp %3, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -139,14 +139,14 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp %5, #0 \n\t" // If no 4-microkernel can be applied. 
-" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) @@ -158,18 +158,18 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" " subs %5, %5, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp %6, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. @@ -182,9 +182,9 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rd z27.d, p0/z, [%1, 8*14] \n\t" GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " sub %6, %6, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" // " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). 
// " ldr x8, %[beta] \n\t" @@ -193,7 +193,7 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). " ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). " \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) // " ldr x9, %[a_next] \n\t" // " ldr x10, %[b_next] \n\t" #ifdef _A64FX @@ -209,89 +209,89 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " prfm PLDL1STRM, [%10] \n\t" " prfm PLDL1STRM, [%10, 256*1] \n\t" " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " fmov d27, #1.0 \n\t" " fcmp d29, #0.0 \n\t" // Whether Imag(alpha) == 0. " fccmp d28, d27, 0, eq \n\t" // Whether Real(alpha) == 1. -" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) -" b WRITE_MEM_EXEC \n\t" +BRANCH(WRITE_MEM_EXEC) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " \n\t" -" WRITE_MEM_EXEC: \n\t" +LABEL(WRITE_MEM_EXEC) " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. // " cmp %3, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" +LABEL(WRITE_MEM_C) " fmov d29, xzr \n\t" " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. 
-" b.eq ZERO_BETA_C_0_1_2_3 \n\t" +BEQ(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_C_0_1_2_3: \n\t" +LABEL(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) " \n\t" -" b.eq ZERO_BETA_C_4_5_6_7_8_9 \n\t" +BEQ(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_C_4_5_6_7_8_9: \n\t" +LABEL(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " WRITE_MEM_G: \n\t" +// LABEL(WRITE_MEM_G) // " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, // " index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. // " fmov d29, xzr \n\t" // " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. // " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. 
-// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// BEQ(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -// " ZERO_BETA_G_0_1_2_3: \n\t" +// LABEL(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) // " \n\t" -// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// BEQ(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// LABEL(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) // " \n\t" -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov %11, #0 \n\t" // Return normal. : "+r" (a), // %0 "+r" (b), // %1 diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h index 465950999..0c405dfd2 100644 --- a/kernels/armv8a/3/armv8a_asm_utils.h +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -34,7 +34,7 @@ */ -// Apple's local label requirements. +// Apple/Clang's local label requirements. 
#if defined(__APPLE__) || defined(__clang__) #define LABEL(str) " L" #str"%=: \n\t" #define BEQ(str) "b.eq L" #str"%= \n\t" From 26742910a087947780a089360e2baf82ea109e01 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 13 Feb 2022 16:53:45 -0600 Subject: [PATCH 028/230] Update CC_VENDOR logic Look for `GCC` in addition to `gcc` to handle weird conda version strings. [ci skip] --- configure | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 95a97c6b1..c03df26cd 100755 --- a/configure +++ b/configure @@ -1454,7 +1454,7 @@ get_compiler_version() # isolate the version number. # The last part ({ read first rest ; echo $first ; }) is a workaround # to OS X's egrep only returning the first match. - cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG' | { read first rest ; echo $first ; }) + cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI|crosstool-NG|GCC' | { read first rest ; echo $first ; }) # AOCC version strings contain both "clang" and "AOCC" substrings, and # so we have perform a follow-up check to make sure cc_vendor gets set @@ -1472,6 +1472,10 @@ get_compiler_version() # Begin parsing cc_vendor for the version string. + if [ "${cc_vendor}" = "GCC" ]; then + # Conda gcc sometimes has GCC (all caps) in the version string + cc_vendor="gcc" + fi if [ "${cc_vendor}" = "crosstool-NG" ]; then # Treat compilers built by crosstool-NG (for eg: conda) as gcc. cc_vendor="gcc" From 5a4d3f5208d3d8cc1827f8cc90414c764b7ebab3 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 13 Feb 2022 17:28:30 -0600 Subject: [PATCH 029/230] Use -flat_namespace option to link on macOS Fixes #611. 
--- common.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.mk b/common.mk index 5f2d30c9b..b9e3d97a7 100644 --- a/common.mk +++ b/common.mk @@ -552,7 +552,7 @@ endif # NOTE: The flag for creating shared objects is different for Linux and OS X. ifeq ($(OS_NAME),Darwin) # OS X shared library link flags. -SOFLAGS := -dynamiclib +SOFLAGS := -dynamiclib -Wl,-flat_namespace ifeq ($(MK_ENABLE_RPATH),yes) SOFLAGS += -Wl,-install_name,@rpath/$(LIBBLIS_SONAME) else From 25061593460767221e1066f9d720fa6676bbed8f Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 13 Feb 2022 20:11:55 -0600 Subject: [PATCH 030/230] Don't use `-Wl,-flat-namespace`. Flat namespaces can cause problems due to conflicting system libraries, etc., so just mark `xerbla_` as a weak symbol on macOS instead. --- common.mk | 2 +- frame/compat/f2c/bla_xerbla.h | 2 +- frame/include/bli_config_macro_defs.h | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/common.mk b/common.mk index b9e3d97a7..5f2d30c9b 100644 --- a/common.mk +++ b/common.mk @@ -552,7 +552,7 @@ endif # NOTE: The flag for creating shared objects is different for Linux and OS X. ifeq ($(OS_NAME),Darwin) # OS X shared library link flags. 
-SOFLAGS := -dynamiclib -Wl,-flat_namespace +SOFLAGS := -dynamiclib ifeq ($(MK_ENABLE_RPATH),yes) SOFLAGS += -Wl,-install_name,@rpath/$(LIBBLIS_SONAME) else diff --git a/frame/compat/f2c/bla_xerbla.h b/frame/compat/f2c/bla_xerbla.h index 44c168e58..f9f0a4641 100644 --- a/frame/compat/f2c/bla_xerbla.h +++ b/frame/compat/f2c/bla_xerbla.h @@ -34,6 +34,6 @@ #ifdef BLIS_ENABLE_BLAS -BLIS_EXPORT_BLAS int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len); +BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len); #endif diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 5a4c8a15d..0c75fb639 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -236,6 +236,22 @@ #define BLIS_EXPORT_ADDON BLIS_EXPORT +// -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- + +// On Linux, functions called from a shared library can be overridden by the main +// program simply by providing a new definition. However, macOS uses a "two-level +// namespace" which causes calls to shared library functions to be tied to the +// library and not overridable. As a workaround, certain symbols can be defined +// as "weak" and are given lower preference during linking. +#ifndef BLIS_OVERRIDABLE +#if BLIS_OS_OSX +#define BLIS_OVERRIDABLE __attribute__((weak)) +#else +#define BLIS_OVERRIDABLE +#endif +#endif + + // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, From ee9ff988c49f16696679d4c6cd3dcfcac7295be7 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 15 Feb 2022 15:01:51 -0600 Subject: [PATCH 031/230] Move edge cases to gemmtrsm ukrs; doc updates. Details: - Moved edge-case handling into the gemmtrsm microkernel. 
This required changing the microkernel API to take m and n dimension parameters as well as updating all existing gemmtrsm microkernel function pointer types, function signatures, and related definitions to take m and n dimensions. Also updated all existing gemmtrsm kernels in the 'kernels' directory (which for now is limited to haswell and penryn kernel sets, plus native and 1m-based reference kernels in 'ref_kernels') to take m and n dimensions, and implemented edge-case handling within those microkernels via a collection of new C preprocessor macros defined within bli_edge_case_macro_defs.h. Note that the edge-case handling for gemm-like operations had already been relocated into the gemm microkernel in 54fa28b. - Added descriptive comments to GEMM_UKR_SETUP_CT() and related macros in bli_edge_case_macro_defs.h to allow for easier reading. - Updated docs/KernelsHowTo.md to reflect above changes. Also cleaned up the bullet under "Implementation Notes for gemm" that covers alignment issues. (Thanks to Ivan Korostelev for pointing out the confusing and outdated language in issue #591.) - Other minor tweaks to KernelsHowTo.md. 
--- docs/KernelsHowTo.md | 64 ++++++---- frame/3/bli_l3_ft_ukr.h | 2 + frame/3/bli_l3_ind_ukr.h | 4 + frame/3/bli_l3_ukr_oapi.c | 6 + frame/3/bli_l3_ukr_prot.h | 2 + frame/3/bli_l3_ukr_tapi.c | 4 + frame/3/trsm/bli_trsm_ll_ker_var2.c | 58 +++------ frame/3/trsm/bli_trsm_lu_ker_var2.c | 58 +++------ frame/3/trsm/bli_trsm_rl_ker_var2.c | 60 +++------- frame/3/trsm/bli_trsm_ru_ker_var2.c | 60 +++------- frame/include/bli_edge_case_macro_defs.h | 110 +++++++++++++++++- .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c | 12 ++ .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c | 12 ++ .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c | 5 + .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c | 5 + ref_kernels/3/bli_gemmtrsm_ref.c | 45 ++++++- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 50 ++++++-- 17 files changed, 352 insertions(+), 205 deletions(-) diff --git a/docs/KernelsHowTo.md b/docs/KernelsHowTo.md index 302b1c75d..6e84db8e7 100644 --- a/docs/KernelsHowTo.md +++ b/docs/KernelsHowTo.md @@ -113,7 +113,7 @@ Note that all kernels, whether they be reference implementations or based on ful The first step is to obtain a valid context. Contexts store all of the information specific to a particular sub-configuration (usually loosely specific to a -microarchitecture or group of closely-related microarchitectuers). If a context is +microarchitecture or group of closely-related microarchitectures). If a context is not already available in your current scope, a default context for the hardware for which BLIS was configured (or, in the case of multi-configuration builds, the hardware on which BLIS is currently running) may be queried via: @@ -229,7 +229,7 @@ This section seeks to provide developers with a complete reference for each of t The function prototypes in this section follow the same guidelines as those listed in the [BLIS typed API reference](BLISTypedAPI.md#Notes_for_using_this_reference). Namely: * Any occurrence of `?` should be replaced with `s`, `d`, `c`, or `z` to form an actual function name. 
- * Any occurrence of `ctype` should be replaced with the actual C type corresponding to the datatype instance in question. + * Any occurrence of `ctype` should be replaced with the actual C99 language type corresponding to the datatype instance in question. * Some matrix arguments have associated row and column strides arguments that proceed them, typically listed as `rsX` and `csX` for a given matrix `X`. Row strides are always listed first, and column strides are always listed second. The semantic meaning of a row stride is "the distance, in units of elements, from any given element to the corresponding element (within the same column) of the next row," and the meaning of a column stride is "the distance, in units of elements, from any given element to the corresponding element (within the same row) of the next column." Thus, unit row stride implies column-major storage and unit column stride implies row-major storage. * All occurrences of `alpha` and `beta` parameters are scalars. @@ -248,6 +248,8 @@ This section describes in detail the various level-3 microkernels supported by B ```c void bli_?gemm_ ( + dim_t m, + dim_t n, dim_t k, ctype* restrict alpha, ctype* restrict a1, @@ -264,6 +266,8 @@ where `` is implementation-dependent. (Recall that the precise ` ```c void bli_?gemm_ukernel ( + dim_t m, + dim_t n, dim_t k, ctype* restrict alpha, ctype* restrict a1, @@ -274,6 +278,7 @@ void bli_?gemm_ukernel cntx_t* restrict cntx ); ``` +This function simply queries a microkernel function pointer from the context specified by `cntx`. Note that in the case of either method of calling the microkernel, `cntx` must be a valid pointer. (Passing in `NULL` will *not* result in a default context being used.) 
The `gemm` microkernel, sometimes simply referred to as "the BLIS microkernel" or "the microkernel", performs the following operation: @@ -281,16 +286,20 @@ The `gemm` microkernel, sometimes simply referred to as "the BLIS microkernel" o C11 := beta * C11 + alpha * A1 * B1 ``` -where `A1` is an _MR x k_ "micropanel" matrix stored in packed (column-wise) format, `B1` is a _k x NR_ "micropanel" matrix stored in packed (row-wise) format, `C11` is an _MR x NR_ general matrix stored according to its row and column strides `rsc` and `csc`, and `alpha` and beta are scalars. +where `A1` is an _m x k_ "micropanel" matrix stored in packed (column-wise) format, `B1` is a _k x n_ "micropanel" matrix stored in packed (row-wise) format, `C11` is an _m x n_ "microtile" matrix stored according to its row and column strides `rsc` and `csc`, and `alpha` and beta are scalars. -_MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md). +Here, _m <= MR_ and _n <= NR_, where _MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md). + +**Note:** For many years, BLIS defined its microkernel to operate on microtiles whose dimensions were *exactly* _MR x NR_. 
However, as of commit 54fa28b, we have augmented the `gemm` microkernel API to pass in _m_ and _n_ dimensions as well as _k_. This change was made as part of our decision to move edge-case handling into the microkernel, whereas previously it was handled outside of the microkernel, within the portable parts of BLIS framework. And while this does mean additional complexity for microkernel authors, adding generic edge-case handling can be done in a relatively painless manner by employing some pre-defined preprocessor macros (which are defined in `bli_edge_case_macro_defs.h`). For examples of how to use these macros, please see the beginning and end of existing microkernel functions residing within the `kernels` directory. Parameters: + * `m`: The number of rows of `C11` and `A1`. + * `n`: The number of columns of `C11` and `B1`. * `k`: The number of columns of `A1` and rows of `B1`. * `alpha`: The address of a scalar to the `A1 * B1` product. - * `a1`: The address of a micropanel of matrix `A` of dimension _MR x k_, stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.) - * `b1`: The address of a micropanel of matrix `B` of dimension _k x NR_, stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.) + * `a1`: The address of a micropanel of matrix `A` of dimension _m x k_ (where _m <= MR_), stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.) + * `b1`: The address of a micropanel of matrix `B` of dimension _k x n_ (where _n <= NR_), stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. 
(See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.) * `beta`: The address of a scalar to the input value of matrix `C11`. * `c11`: The address of a matrix `C11` of dimension _MR x NR_, stored according to `rsc` and `csc`. * `rsc`: The row stride of matrix `C11` (ie: the distance to the next row, in units of matrix elements). @@ -321,24 +330,24 @@ The diagram below shows the packed micropanel operands and how elements of each #### Implementation Notes for gemm - * **Register blocksizes.** The register blocksizes `MR` and `NR`, corresponding to the number of *logical* rows in `a1` and columns in `b1`, respectively, are defined in the context and may be queried via `bli_cntx_get_blksz_def_dt()`. However, you shouldn't need to query these values since the implementation inherently "knows" them already. - * **Leading dimensions of `a1` and `b1`: _PACKMR_ and _PACKNR_.** The packed micropanels `a1` and `b1` are simply stored in column-major and row-major order, respectively. Usually, the width of either micropanel (ie: the number of logical rows of `a1`, or _MR_, and the number of columns of `b1`, or _NR_) is equal to that micropanel's so-called "leading dimension", or number of *physical* rows. Sometimes, it may be beneficial to specify a leading dimension that is larger than the panel width. This may be desirable because it allows each column of `a1` or row of `b1` to maintain a certain alignment in memory that would not otherwise be maintained by _MR_ and/or _NR_. In this case, you should index through `a1` and `b1` using the values _PACKMR_ and _PACKNR_, respectively (which are stored in the context as the blocksize "maximums" associated with the `bszid_t` values `BLIS_MR` and `BLIS_NR`). These values are defined in the context and may be queried via `bli_cntx_get_blksz_max_dt()`. However, you shouldn't need to query these values since the implementation inherently "knows" them already. 
+ * **Register blocksizes.** The register blocksizes `MR` and `NR`, corresponding to the maximum number of *logical* rows in `a1` and columns in `b1`, respectively, are defined in the context and may be queried via `bli_cntx_get_blksz_def_dt()`. However, you shouldn't need to query these values since the implementation inherently "knows" them already. + * **Leading dimensions of `a1` and `b1`: _PACKMR_ and _PACKNR_.** The packed micropanels `a1` and `b1` are simply stored in column-major and row-major order, respectively. Usually, the width of either micropanel (ie: the number of *logical* rows of `a1` and the number of columns of `b1`) is equal to that micropanel's so-called "leading dimension", or number of *physical* rows. Sometimes, it may be beneficial to specify a leading dimension that is larger than the panel width. This may be desirable because it allows each column of `a1` or row of `b1` to maintain a certain alignment in memory that would not otherwise be maintained by _MR_ and/or _NR_, which would otherwise serve as the maximum value for each micropanel, respectively. If you want your microkernel to support _MR < PACKMR_ or _NR < PACKNR_, you should index through columns of `a1` and rows of `b1` using the values _PACKMR_ and _PACKNR_, respectively (which are stored in the context as the blocksize "maximums" associated with the `bszid_t` values `BLIS_MR` and `BLIS_NR`). These values are defined in the context and may be queried via `bli_cntx_get_blksz_max_dt()`. However, you shouldn't need to query these values since the microkernel implementation inherently must "know" them already. * **Storage preference of `c11`.** Usually, an optimized `gemm` microkernel will have a "preferred" storage format for `C11`--typically either contiguous row-storage (i.e. `cs_c` = 1) or contiguous column-storage (i.e. `rs_c` = 1). This preference comes from how the microkernel is most efficiently able to load/store elements of `C11` from/to memory. 
Most microkernels use vector instructions to access contiguous columns (or column segments) of `C11`. However, the developer may decide that accessing contiguous rows (or row segments) is more desirable. If this is the case, this preference should be indicated via the `bool` argument when registering microkernels via `bli_cntx_set_l3_nat_ukrs()`--`TRUE` indicating a row preference and `FALSE` indicating a column preference. Properly setting this property allows the framework to perform a runtime optimization that will ensure the microkernel preference is honored, if at all possible. - * **Edge cases in _MR_, _NR_ dimensions.** Sometimes the microkernel will be called with micropanels `a1` and `b1` that correspond to edge cases, where only partial results are needed. Zero-padding is handled automatically by the packing function to facilitate reuse of the same microkernel. Similarly, the logic for computing to temporary storage and then saving only the elements that correspond to elements of `C11` that exist (at the edges) is handled automatically within the macrokernel. - * **Alignment of `a1` and `b1`.** By default, the alignment of addresses `a1` and `b1` are aligned only to `sizeof(type)`. If `BLIS_POOL_ADDR_ALIGN_SIZE` is set to some larger multiple of `sizeof(type)`, such as the page size, then the *first* `a1` and `b1` micropanels will be aligned to that value, but subsequent micropanels will only be aligned to `sizeof(type)`, or, if `BLIS_POOL_ADDR_ALIGN_SIZE` is a multiple of `PACKMR` and `PACKNR`, then subsequent micropanels `a1` and `b1` will be aligned to `PACKMR * sizeof(type)` and `PACKNR * sizeof(type)`, respectively. - * **Unrolling loops.** As a general rule of thumb, the loop over _k_ is sometimes moderately unrolled; for example, in our experience, an unrolling factor of _u_ = 4 is fairly common. If unrolling is applied in the _k_ dimension, edge cases must be handled to support values of _k_ that are not multiples of _u_. 
It is nearly universally true that there should be no loops in the _MR_ or _NR_ directions; in other words, iteration over these dimensions should always be fully unrolled (within the loop over _k_). + * **Edge cases in _MR_, _NR_ dimensions.** Sometimes the microkernel will be called with micropanels `a1` and `b1` that correspond to edge cases, where only partial results are needed. This edge-case handling was once performed by the framework automatically. However, as of commit 54fa28b, edge-case handling is the responsibility of the microkernel. This means that the kernel author will need to handle all possible values of _m_ and _n_ that are equal to **or** less than _MR_ and _NR_, respectively. Fortunately, this can be implemented outside of the assembly region of the microkernel with preprocessor macros. Please reference the existing microkernels in the `kernels` directory for examples of how this is done. (The macros that are now employed by most of BLIS's microkernels are defined in `bli_edge_case_macro_defs.h`.) + * **Alignment of `a1` and `b1`.** By default, the addresses `a1` and `b1` are aligned to the page size (4096 bytes). These alignment factors are set by `BLIS_POOL_ADDR_ALIGN_SIZE_A` and `BLIS_POOL_ADDR_ALIGN_SIZE_B`, respectively. Note that these alignment factors control only the alignment of the *first* micropanel within a given packed block of matrix `A` or packed row-panel of matrix `B`. Subsequent micropanels will only be aligned to `sizeof(type)`, or, if `BLIS_POOL_ADDR_ALIGN_SIZE_A` is a multiple of `PACKMR` and/or `BLIS_POOL_ADDR_ALIGN_SIZE_B` is a multiple of `PACKNR`, then subsequent micropanels `a1` and/or `b1` will be aligned to `PACKMR * sizeof(type)` and/or `PACKNR * sizeof(type)`, respectively. + * **Unrolling loops.** As a general rule of thumb, the loop over _k_ is sometimes moderately unrolled; for example, in our experience, an unrolling factor of _u_ = 4 is fairly common. 
If unrolling is applied in the _k_ dimension, edge cases must be handled to support values of _k_ that are not multiples of _u_. It is nearly universally true that the microkernel should not contain loops in the _m_ or _n_ directions; in other words, iteration over these dimensions should always be fully unrolled (within the loop over _k_). * **Zero `beta`.** If `beta` = 0.0 (or 0.0 + 0.0i for complex datatypes), then the microkernel should NOT use it explicitly, as `C11` may contain uninitialized memory (including elements containing `NaN` or `Inf`). This case should be detected and handled separately by overwriting `C11` with the `alpha * A1 * B1` product. #### Using the auxinfo\_t object -Each microkernel ([gemm](KernelsHowTo.md#gemm-microkernel), [trsm](KernelsHowTo.md#trsm_microkernels), and [gemmtrsm](KernelsHowTo.md#gemmtrsm-microkernels)) takes as its last argument a pointer of type `auxinfo_t`. This BLIS-defined type is defined as a `struct` whose fields contain auxiliary values that may be useful to some microkernel authors, particularly when implementing certain optimization techniques. BLIS provides kernel authors access to the fields of the `auxinfo_t` object via the following function-like preprocessor macros. Each macro takes a single argument, the `auxinfo_t` pointer, and returns one of the values stored within the object. +Each microkernel ([gemm](KernelsHowTo.md#gemm-microkernel), [trsm](KernelsHowTo.md#trsm_microkernels), and [gemmtrsm](KernelsHowTo.md#gemmtrsm-microkernels)) takes as its last argument a pointer of type `auxinfo_t`. This BLIS-defined type is defined as a `struct` whose fields contain auxiliary values that may be useful to some microkernel authors, particularly when implementing certain optimization techniques. BLIS provides kernel authors access to the fields of the `auxinfo_t` object via the following static inline functions. 
Each function takes a single argument, the `auxinfo_t` pointer, and returns one of the values stored within the object. * `bli_auxinfo_next_a()`. Returns the address (`void*`) of the micropanel of `A` that will be used the next time the microkernel will be called. * `bli_auxinfo_next_b()`. Returns the address (`void*`) of the micropanel of `B` that will be used the next time the microkernel will be called. * `bli_auxinfo_ps_a()`. Returns the panel stride (`inc_t`) of the current micropanel of `A`. * `bli_auxinfo_ps_b()`. Returns the panel stride (`inc_t`) of the current micropanel of `B`. -The addresses of the next micropanels of `A` and `B` may be used by the microkernel to perform prefetching, if prefetching is supported by the architecture. Similarly, it may be useful to know the precise distance in memory to the next micropanel. (Note that sometimes the next micropanel to be used is **not** the same as the next micropanel in memory.) +The addresses of the next micropanels of `A` and `B` may be used by the microkernel to perform prefetching, if prefetching is supported by the architecture. Similarly, it may be useful to know the precise distance in memory to the next micropanel. (Note that occasionally the next micropanel to be used is **not** the same as the next micropanel in memory.) Any and all of these values may be safely ignored; they are completely optional. However, BLIS guarantees that all values accessed via the macros listed above will **always** be initialized and meaningful, for every invocation of each microkernel (`gemm`, `trsm`, and `gemmtrsm`). @@ -348,8 +357,7 @@ Any and all of these values may be safely ignored; they are completely optional. 
An example implementation of the `gemm` microkernel may be found in the `template` configuration directory in: * [config/template/kernels/3/bli\_gemm_opt\_mxn.c](https://github.com/flame/blis/tree/master/config/template/kernels/3/bli_gemm_opt_mxn.c) - -Note that this implementation is coded in C99 and lacks several kinds of optimization that are typical of real-world optimized microkernels, such as vector instructions (or intrinsics) and loop unrolling in _MR_ or _NR_. It is meant to serve only as a starting point for a microkernel developer. +Note that this implementation is coded in C99 and lacks several kinds of optimization that are typical of real-world optimized microkernels, such as vector instructions (or intrinsics) and loop unrolling in the _m_ or _n_ dimensions. It is meant to serve only as a starting point for a microkernel developer. @@ -411,6 +419,8 @@ where `A11` is _MR x MR_ and lower (`trsm_l`) or upper (`trsm_u`) triangular, `B _MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md). +**Note:** Although the `gemm` microkernel must handle edge-cases, and therefore must take _m_ and _n_ parameters, the `trsm` microkernels are simpler in that they still assume _m = MR_ and _n = NR_, and therefore do not need these _m_ and _n_ parameters passed in. + Parameters: * `a11`: The address of `A11`, which is the _MR x MR_ lower (`trsm_l`) or upper (`trsm_u`) triangular submatrix within the packed micropanel of matrix `A`. `A11` is stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. 
(See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.) Note that `A11` contains elements in both triangles, though elements in the unstored triangle are not guaranteed to be zero and thus should not be referenced. @@ -454,6 +464,8 @@ Note that these implementations are coded in C99 and lack several kinds of optim ```c void bli_?gemmtrsm_l_ ( + dim_t m, + dim_t n, dim_t k, ctype* restrict alpha, ctype* restrict a10, @@ -467,6 +479,8 @@ void bli_?gemmtrsm_l_ void bli_?gemmtrsm_u_ ( + dim_t m, + dim_t n, dim_t k, ctype* restrict alpha, ctype* restrict a12, @@ -484,6 +498,8 @@ where `` is implementation-dependent. (Recall that the precise ` ```c void bli_?gemmtrsm_l_ukernel ( + dim_t m, + dim_t n, dim_t k, ctype* restrict alpha, ctype* restrict a10, @@ -497,6 +513,8 @@ void bli_?gemmtrsm_l_ukernel void bli_?gemmtrsm_u_ukernel ( + dim_t m, + dim_t n, dim_t k, ctype* restrict alpha, ctype* restrict a12, @@ -517,7 +535,7 @@ The `gemmtrsm_l` microkernel performs the following compound operation: C11 := B11 ``` -where `A11` is _MR_ x _MR_ and lower triangular, `A10` is _MR_ x _k_, and `B01` is _k_ x _NR_. +where `A11` is _MR x MR_ and lower triangular, `A10` is _MR x k_, and `B01` is _k x NR_. The `gemmtrsm_u` microkernel performs: ``` @@ -526,20 +544,22 @@ The `gemmtrsm_u` microkernel performs: C11 := B11 ``` -where `A11` is _MR_ x _MR_ and upper triangular, `A12` is _MR_ x _k_, and `B21` is _k_ x _NR_. -In both cases, `B11` is _MR_ x _NR_ and `alpha` is a scalar. Here, `inv()` denotes matrix inverse. +where `A11` is _MR x MR_ and upper triangular, `A12` is _MR x k_, and `B21` is _k x NR_. +In both cases, `B11` is _MR x NR_ and `alpha` is a scalar. However, `C11` is _m x n_, and therefore the `C11 := B11` statements amount to a copy of only the top-leftmost _m x n_ elements of `B11`. 
(Recall that A11 and B11 are packed and therefore guaranteed to reside within fully-sized micropanels, whereas `C11` exists in the caller-provided output matrix and may represent a bottom-right edge case.) Here, `inv()` denotes matrix inverse. _MR_ and _NR_ are the register blocksizes associated with the microkernel. They are chosen by the developer when the microkernel is written and then encoded into a BLIS configuration, which will reference the microkernel when the BLIS framework is instantiated into a library. For more information on setting register blocksizes and related constants, please see the [BLIS developer configuration guide](ConfigurationHowTo.md). Parameters: + * `m`: The number of rows of `C11`. + * `n`: The number of columns of `C11`. * `k`: The number of columns of `A10` and rows of `B01` (`trsm_l`); the number of columns of `A12` and rows of `B21` (`trsm_u`). * `alpha`: The address of a scalar to be applied to `B11`. * `a10`, `a12`: The address of `A10` or `A12`, which is the _MR x k_ submatrix of the packed micropanel of `A` that is situated to the left (`trsm_l`) or right (`trsm_u`) of the _MR x MR_ triangular submatrix `A11`. `A10` and `A12` are stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.) * `a11`: The address of `A11`, which is the _MR x MR_ lower (`trsm_l`) or upper (`trsm_u`) triangular submatrix within the packed micropanel of matrix `A` that is situated to the right of `A10` (`trsm_l`) or the left of `A12` (`trsm_u`). `A11` is stored by columns with leading dimension _PACKMR_, where typically _PACKMR_ = _MR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKMR_.) Note that `A11` contains elements in both triangles, though elements in the unstored triangle are not guaranteed to be zero and thus should not be referenced. 
* `b01`, `b21`: The address of `B01` and `B21`, which is the _k x NR_ submatrix of the packed micropanel of `B` that is situated above (`trsm_l`) or below (`trsm_u`) the _MR x NR_ block `B11`. `B01` and `B21` are stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.) * `b11`: The address of `B11`, which is the _MR x NR_ submatrix of the packed micropanel of `B`, situated below `B01` (`trsm_l`) or above `B21` (`trsm_u`). `B11` is stored by rows with leading dimension _PACKNR_, where typically _PACKNR_ = _NR_. (See [Implementation Notes for gemm](KernelsHowTo.md#implementation-notes-for-gemm) for a discussion of _PACKNR_.) - * `c11`: The address of `C11`, which is an _MR x NR_ submatrix of matrix `C`, stored according to `rsc` and `csc`. `C11` is the submatrix within `C` that corresponds to the elements which were packed into `B11`. Thus, `C` is the original input matrix `B` to the overall `trsm` operation. + * `c11`: The address of `C11`, which is an _m x n_ submatrix of matrix `C`, stored according to `rsc` and `csc`, where _m <= MR_ and _n <= NR_. `C11` is the submatrix within `C` that corresponds to the elements which were packed into `B11`. Thus, `C` is the original input matrix `B` to the overall `trsm` operation. * `rsc`: The row stride of matrix `C11` (ie: the distance to the next row, in units of matrix elements). * `csc`: The column stride of matrix `C11` (ie: the distance to the next column, in units of matrix elements). * `data`: The address of an `auxinfo_t` object that contains auxiliary information that may be useful when optimizing the `gemmtrsm` microkernel implementation. 
(See [Using the auxinfo\_t object](KernelsHowTo.md#Using_the_auxinfo_t_object) for a discussion of the kinds of values available via `auxinfo_t`, and also [Implementation Notes for gemmtrsm](KernelsHowTo.md#implementation-notes-for-gemmtrsm) for caveats.) @@ -690,7 +710,7 @@ This kernel performs the following operation: ``` y := y + alpha * conja(a) * conjy(x) ``` -where `a` is an _m_ x _b_ matrix, `x` is a vector of length _b_, and `y` is a vector of length _m_. Vectors `x` and `y` are stored with strides `incx` and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit. This kernel is typically implemented as a fused series of _b_ `axpyv` operations updating the same vector `y` (with the elements of `x` serving as the scalars and the columns of `a` serving as the vectors to be scaled). +where `a` is an _m x b_ matrix, `x` is a vector of length _b_, and `y` is a vector of length _m_. Vectors `x` and `y` are stored with strides `incx` and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit. This kernel is typically implemented as a fused series of _b_ `axpyv` operations updating the same vector `y` (with the elements of `x` serving as the scalars and the columns of `a` serving as the vectors to be scaled). --- @@ -714,7 +734,7 @@ This kernel performs the following operation: ``` y := beta * y + alpha * conjat(a)^T conjx(x) ``` -where `a` is an _m_ x _b_ matrix, where `w` is a vector of length _m_, `y` is a vector of length _b_, and `alpha` is a scalar. +where `a` is an _m x b_ matrix, where `w` is a vector of length _m_, `y` is a vector of length _b_, and `alpha` is a scalar. Vectors `x` and `y` are stored with strides `incx` and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit. 
This kernel is typically implemented as a series of _b_ `dotxv` operations with the same right-hand operand vector `x` (contracted with the rows of `a^T` and accumulating to the corresponding elements of vector `y`). @@ -745,7 +765,7 @@ This kernel performs the following operation: y := beta * y + alpha * conjat(a)^T conjw(w) z := z + alpha * conja(a) conjx(x) ``` -where `a` is an _m_ x _b_ matrix, `w` and `z` are vectors of length _m_, `x` and `y` are vectors of length _b_, and `alpha` and `beta` are scalars. +where `a` is an _m x b_ matrix, `w` and `z` are vectors of length _m_, `x` and `y` are vectors of length _b_, and `alpha` and `beta` are scalars. Vectors `w`, `z`, `x` and `y` are stored with strides `incw`, `incz`, `incx`, and `incy`, respectively. Matrix `a` is stored with row stride `inca` and column stride `lda`, though `inca` is most often (in practice) unit. This kernel is typically implemented as a series of _b_ `dotxv` operations with the same right-hand operand vector `w` fused with a series of _b_ `axpyv` operations updating the same vector `z`. 
diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h index 561c8264f..28065c208 100644 --- a/frame/3/bli_l3_ft_ukr.h +++ b/frame/3/bli_l3_ft_ukr.h @@ -69,6 +69,8 @@ INSERT_GENTDEF( gemm ) \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ diff --git a/frame/3/bli_l3_ind_ukr.h b/frame/3/bli_l3_ind_ukr.h index f73a6ad90..6f24e71fc 100644 --- a/frame/3/bli_l3_ind_ukr.h +++ b/frame/3/bli_l3_ind_ukr.h @@ -43,6 +43,8 @@ \ void PASTEMAC(ch,opname) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ @@ -61,6 +63,8 @@ INSERT_GENTPROT_BASIC0( gemm1m_ukr_name ) \ void PASTEMAC(ch,opname) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ diff --git a/frame/3/bli_l3_ukr_oapi.c b/frame/3/bli_l3_ukr_oapi.c index b8f2e00e6..e500bab71 100644 --- a/frame/3/bli_l3_ukr_oapi.c +++ b/frame/3/bli_l3_ukr_oapi.c @@ -111,6 +111,8 @@ void PASTEMAC0(opname) \ \ num_t dt = bli_obj_dt( c11 ); \ \ + dim_t m = bli_obj_length( c11 ); \ + dim_t n = bli_obj_width( c11 ); \ dim_t k = bli_obj_width( a1x ); \ void* buf_a1x = bli_obj_buffer_at_off( a1x ); \ void* buf_a11 = bli_obj_buffer_at_off( a11 ); \ @@ -140,6 +142,8 @@ void PASTEMAC0(opname) \ \ f \ ( \ + m, \ + n, \ k, \ buf_alpha, \ buf_a1x, \ @@ -160,6 +164,8 @@ void PASTEMAC0(opname) \ \ f \ ( \ + m, \ + n, \ k, \ buf_alpha, \ buf_a1x, \ diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h index f68973ff5..677afc020 100644 --- a/frame/3/bli_l3_ukr_prot.h +++ b/frame/3/bli_l3_ukr_prot.h @@ -59,6 +59,8 @@ void PASTEMAC(ch,opname) \ \ void PASTEMAC(ch,opname) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c index ab745d12b..56eaf3f4c 100644 --- a/frame/3/bli_l3_ukr_tapi.c +++ b/frame/3/bli_l3_ukr_tapi.c @@ -83,6 +83,8 @@ INSERT_GENTFUNC_BASIC2( 
gemm_ukernel, gemm, BLIS_GEMM_UKR ) \ void PASTEMAC(ch,opname) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ @@ -105,6 +107,8 @@ void PASTEMAC(ch,opname) \ /* Invoke the typed function for the given datatype. */ \ f \ ( \ + m, \ + n, \ k, \ alpha, \ a1x, \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index b503efa5b..f50f739e7 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -176,12 +176,14 @@ void PASTEMAC(ch,varname) \ temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ +/* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +*/ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ @@ -276,10 +278,6 @@ void PASTEMAC(ch,varname) \ know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the last row and the next multiple of MR zero-padded accordingly. */ \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ @@ -409,44 +407,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. 
*/ \ - gemmtrsm_ukr \ - ( \ - k_a10, \ - alpha1_cast, \ - a10, \ - a11, \ - b01, \ - b11, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. */ \ - gemmtrsm_ukr \ - ( \ - k_a10, \ - alpha1_cast, \ - a10, \ - a11, \ - b01, \ - b11, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the bottom edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + gemmtrsm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_a10, \ + alpha1_cast, \ + a10, \ + a11, \ + b01, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ \ a1 += ps_a_cur; \ } \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 55ceafb91..4f3514143 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -176,12 +176,14 @@ void PASTEMAC(ch,varname) \ temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ +/* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +*/ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ @@ -284,10 +286,6 @@ void PASTEMAC(ch,varname) \ know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the last row and the next multiple of MR zero-padded accordingly. */ \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. 
*/ \ @@ -419,44 +417,20 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. */ \ - gemmtrsm_ukr \ - ( \ - k_a12, \ - alpha1_cast, \ - a12, \ - a11, \ - b21, \ - b11, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. */ \ - gemmtrsm_ukr \ - ( \ - k_a12, \ - alpha1_cast, \ - a12, \ - a11, \ - b21, \ - b11, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the bottom edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + gemmtrsm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_a12, \ + alpha1_cast, \ + a12, \ + a11, \ + b21, \ + b11, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ \ a1 += ps_a_cur; \ } \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 23d4dd728..b4937134f 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -181,12 +181,14 @@ void PASTEMAC(ch,varname) \ temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ +/* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +*/ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ @@ -302,10 +304,6 @@ void PASTEMAC(ch,varname) \ know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the last column and the next multiple of NR zero-padded accordingly. 
*/ \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ @@ -424,44 +422,21 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. */ \ - gemmtrsm_ukr \ - ( \ - k_b21, \ - alpha1_cast, \ - b21, \ - b11, \ - a12, \ - a11, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. */ \ - gemmtrsm_ukr \ - ( \ - k_b21, \ - alpha1_cast, \ - b21, \ - b11, \ - a12, \ - a11, \ - ct, cs_ct, rs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the bottom edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + gemmtrsm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_b21, \ + alpha1_cast, \ + b21, \ + b11, \ + a12, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ +\ } \ \ a1 += rstep_a; \ @@ -512,6 +487,7 @@ void PASTEMAC(ch,varname) \ &aux, \ cntx \ ); \ +\ } \ \ a1 += rstep_a; \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 71381707c..09942d311 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -181,12 +181,14 @@ void PASTEMAC(ch,varname) \ temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ +/* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +*/ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ @@ -297,10 +299,6 @@ void PASTEMAC(ch,varname) \ know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the last column and the next multiple of NR zero-padded accordingly. */ \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ @@ -417,44 +415,21 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ - /* Handle interior and edge cases separately. */ \ - if ( m_cur == MR && n_cur == NR ) \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. */ \ - gemmtrsm_ukr \ - ( \ - k_b01, \ - alpha1_cast, \ - b01, \ - b11, \ - a10, \ - a11, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ - } \ - else \ - { \ - /* Invoke the fused gemm/trsm micro-kernel. */ \ - gemmtrsm_ukr \ - ( \ - k_b01, \ - alpha1_cast, \ - b01, \ - b11, \ - a10, \ - a11, \ - ct, cs_ct, rs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Copy the result to the bottom edge of C. */ \ - PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - c11, rs_c, cs_c ); \ - } \ + gemmtrsm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k_b01, \ + alpha1_cast, \ + b01, \ + b11, \ + a10, \ + a11, \ + c11, cs_c, rs_c, \ + &aux, \ + cntx \ + ); \ +\ } \ \ a1 += rstep_a; \ @@ -505,6 +480,7 @@ void PASTEMAC(ch,varname) \ &aux, \ cntx \ ); \ +\ } \ \ a1 += rstep_a; \ diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h index 4a1fba7ac..70d97d5d1 100644 --- a/frame/include/bli_edge_case_macro_defs.h +++ b/frame/include/bli_edge_case_macro_defs.h @@ -35,8 +35,11 @@ #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H +// +// Macros for edge-case handling within gemm microkernels. 
+// -// Helper macros for edge-case handling within gemm microkernels. +// -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ @@ -62,8 +65,14 @@ beta = &_zero; \ } +// -- Setup macros -- + #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ + /* Scenario 1: the ukernel contains assembly-level support only for its + IO preference (e.g. only row-oriented or only column-oriented IO). + Use a temporary microtile for the other two cases as well as edge + cases. */ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ @@ -71,6 +80,10 @@ #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ + /* Scenario 2: the ukernel contains assembly-level support for its IO + preference as well as its opposite via in-register transpose + (e.g. both row- and column-oriented IO). Use a temporary microtile + for the general stride case as well as edge cases. */ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ @@ -78,12 +91,16 @@ #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ + /* Scenario 3: Similar to (2) where the assembly region also supports + general stride I0. Use a temporary microtile only for edge cases. */ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ - const bool _use_ct = m != mr || n != nr; \ + const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ + /* Scenario 4: Similar to (1), but uses temporary microtile to handle + cases where the pointer to the C microtile is not aligned. */ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ @@ -91,8 +108,12 @@ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); +// -- Flush macros -- + #define GEMM_UKR_FLUSH_CT(ch) \ \ + /* If we actually used the temporary microtile, accumulate it to the output + microtile. */ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ @@ -105,5 +126,90 @@ } \ +// +// Macros for edge-case handling within gemmtrsm microkernels. +// + +// -- Setup helper macros -- + +#define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ +\ + PASTEMAC(ch,ctype)* restrict _c = c11; \ + const inc_t _rs_c = rs_c; \ + const inc_t _cs_c = cs_c; \ + PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ + __attribute__((aligned(alignment))); \ + const inc_t _rs_ct = row_major ? nr : 1; \ + const inc_t _cs_ct = row_major ? 1 : mr; + +#define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ +\ + if ( _use_ct ) \ + { \ + c11 = _ct; \ + rs_c = _rs_ct; \ + cs_c = _cs_ct; \ + } + +// -- Setup macros -- + +#define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ +\ + /* Scenario 1: the ukernel contains assembly-level support only for its + IO preference (e.g. only row-oriented or only column-oriented IO). + Use a temporary microtile for the other two cases as well as edge + cases. */ \ + GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ + const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ + m != mr || n != nr; \ + GEMMTRSM_UKR_SETUP_CT_POST(ch); + +#define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ +\ + /* Scenario 2: the ukernel contains assembly-level support for its IO + preference as well as its opposite via in-register transpose + (e.g. both row- and column-oriented IO). Use a temporary microtile + for the general stride case as well as edge cases. 
*/ \ + GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ + const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ + m != mr || n != nr; \ + GEMMTRSM_UKR_SETUP_CT_POST(ch); + +#define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ +\ + /* Scenario 3: Similar to (2) where the assembly region also supports + general stride I0. Use a temporary microtile only for edge cases. */ \ + GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ + const bool _use_ct = ( m != mr || n != nr ); \ + GEMMTRSM_UKR_SETUP_CT_POST(ch); + +#define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ +\ + /* Scenario 4: Similar to (1), but uses temporary microtile to handle + cases where the pointer to the C microtile is not aligned. */ \ + GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ + const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ + m != mr || n != nr || \ + ( (uintptr_t)_c % alignment ) || \ + ( ( ( row_major ? _rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ + GEMMTRSM_UKR_SETUP_CT_POST(ch); + +// -- Flush macros -- + +#define GEMMTRSM_UKR_FLUSH_CT(ch) \ +\ + /* If we actually used the temporary microtile, use it to overwrite the + output microtile. Used by trsm. */ \ + if ( _use_ct ) \ + { \ + PASTEMAC(ch,copys_mxn) \ + ( \ + m, n, \ + _ct, _rs_ct, _cs_ct, \ + _c, _rs_c, _cs_c \ + ); \ + } \ + + #endif diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index aead3ea9f..d0d0ff211 100644 --- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -58,6 +58,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 ( + dim_t m, + dim_t n, dim_t k0, float* restrict alpha, float* restrict a10, @@ -81,6 +83,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 float* beta = bli_sm1; + GEMMTRSM_UKR_SETUP_CT_ANY( s, 6, 16, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. 
@@ -825,6 +829,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMMTRSM_UKR_FLUSH_CT( s ); } @@ -843,6 +849,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 void bli_dgemmtrsm_l_haswell_asm_6x8 ( + dim_t m, + dim_t n, dim_t k0, double* restrict alpha, double* restrict a10, @@ -866,6 +874,8 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 double* beta = bli_dm1; + GEMMTRSM_UKR_SETUP_CT_ANY( d, 6, 8, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. @@ -1572,6 +1582,8 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMMTRSM_UKR_FLUSH_CT( d ); } diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index 2849e6994..68a8c069b 100644 --- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -58,6 +58,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 ( + dim_t m, + dim_t n, dim_t k0, float* restrict alpha, float* restrict a10, @@ -81,6 +83,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 float* beta = bli_sm1; + GEMMTRSM_UKR_SETUP_CT_ANY( s, 6, 16, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. @@ -830,6 +834,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMMTRSM_UKR_FLUSH_CT( s ); } @@ -848,6 +854,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 void bli_dgemmtrsm_u_haswell_asm_6x8 ( + dim_t m, + dim_t n, dim_t k0, double* restrict alpha, double* restrict a10, @@ -871,6 +879,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 double* beta = bli_dm1; + GEMMTRSM_UKR_SETUP_CT_ANY( d, 6, 8, true ); + begin_asm() vzeroall() // zero all xmm/ymm registers. 
@@ -1583,6 +1593,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) + + GEMMTRSM_UKR_FLUSH_CT( d ); } diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c index 56afcf08c..7bef618fa 100644 --- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c @@ -56,6 +56,8 @@ void bli_sgemmtrsm_l_penryn_asm_8x4 void bli_dgemmtrsm_l_penryn_asm_4x4 ( + dim_t m, + dim_t n, dim_t k0, double* restrict alpha, double* restrict a10, @@ -76,6 +78,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); + begin_asm() mov(var(a10), rax) // load address of a10. @@ -561,6 +565,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 "memory" ) + GEMMTRSM_UKR_FLUSH_CT( d ); } diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c index 9811e0e32..add12ea24 100644 --- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c @@ -56,6 +56,8 @@ void bli_sgemmtrsm_u_penryn_asm_8x4 void bli_dgemmtrsm_u_penryn_asm_4x4 ( + dim_t m, + dim_t n, dim_t k0, double* restrict alpha, double* restrict a12, @@ -76,6 +78,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; + GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); + begin_asm() mov(var(a12), rax) // load address of a12. 
@@ -546,6 +550,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 "memory" ) + GEMMTRSM_UKR_FLUSH_CT( d ); } diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 2b260c881..30fc3fcd6 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -39,6 +39,8 @@ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ @@ -52,8 +54,9 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t rs_b = packnr; \ @@ -65,13 +68,35 @@ void PASTEMAC3(ch,opname,arch,suf) \ gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR + instead? */ \ + const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : nr ); \ + const inc_t cs_ct = ( col_pref ? 
mr : 1 ); \ +\ + const bool use_ct = ( m < mr || n < nr ); \ +\ + ctype* restrict c11_use = c11; \ + inc_t rs_c_use = rs_c; \ + inc_t cs_c_use = cs_c; \ +\ + if ( use_ct ) \ + { \ + c11_use = ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ \ /* lower: b11 = alpha * b11 - a10 * b01; */ \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ gemm_ukr \ ( \ - mr, \ - nr, \ + m, \ + n, \ k, \ minus_one, \ a1x, \ @@ -88,10 +113,20 @@ void PASTEMAC3(ch,opname,arch,suf) \ ( \ a11, \ b11, \ - c11, rs_c, cs_c, \ + c11_use, rs_c_use, cs_c_use, \ data, \ cntx \ ); \ +\ + if ( use_ct ) \ + { \ + PASTEMAC(ch,copys_mxn) \ + ( \ + m, n, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c \ + ); \ + } \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 96f5a16fe..08823f073 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -39,6 +39,8 @@ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ + dim_t m, \ + dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ @@ -59,7 +61,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTECH(ch,trsm_ukr_ft) \ ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref_r = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -98,6 +100,28 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* b_use; \ inc_t rs_b_use; \ inc_t cs_b_use; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR + instead? 
*/ \ + const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : nr ); \ + const inc_t cs_ct = ( col_pref ? mr : 1 ); \ +\ + const bool use_ct = ( m < mr || n < nr ); \ +\ + ctype* restrict c11_use = c11; \ + inc_t rs_c_use = rs_c; \ + inc_t cs_c_use = cs_c; \ +\ + if ( use_ct ) \ + { \ + c11_use = ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ \ \ /* Handle alphas with non-zero imaginary components. */ \ @@ -113,7 +137,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ bli_abort(); \ \ -/* + /* ctype_r* restrict one_r = PASTEMAC(chr,1); \ \ const inc_t ld_b = rs_b; \ @@ -125,17 +149,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ b11, rs_b, cs_b, ld_b ); \ \ alpha_r = *one_r; \ -*/ \ + */ \ } \ \ \ { \ /* Set the strides for the temporary bt matrix based on the native real domain micro-kernel storage preferences. */ \ - if ( col_pref ) { rs_bt = 1; cs_bt = mr; \ - rs_bt_r = 1; cs_bt_r = mr_r; } \ - else { rs_bt = nr; cs_bt = 1; \ - rs_bt_r = nr_r; cs_bt_r = 1; } \ + if ( col_pref_r ) { rs_bt = 1; cs_bt = mr; \ + rs_bt_r = 1; cs_bt_r = mr_r; } \ + else { rs_bt = nr; cs_bt = 1; \ + rs_bt_r = nr_r; cs_bt_r = 1; } \ \ b_use = ( ctype_r* )bt; \ rs_b_use = rs_bt_r; \ @@ -241,10 +265,20 @@ void PASTEMAC3(ch,opname,arch,suf) \ ( \ a11, \ b11, \ - c11, rs_c, cs_c, \ + c11_use, rs_c_use, cs_c_use, \ data, \ cntx \ ); \ +\ + if ( use_ct ) \ + { \ + PASTEMAC(ch,copys_mxn) \ + ( \ + m, n, \ + ct, rs_ct, cs_ct, \ + c11, rs_c, cs_c \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC3( gemmtrsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR ) From c9700f369aa84fc00f36c4b817ffb7dab72b865d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 15 Feb 2022 15:36:52 -0600 Subject: [PATCH 032/230] Renamed SIMD-related macro constants for clarity. 
Details: - Renamed the following macros defined in bli_kernel_macro_defs.h: BLIS_SIMD_NUM_REGISTERS -> BLIS_SIMD_MAX_NUM_REGISTERS BLIS_SIMD_SIZE -> BLIS_SIMD_MAX_SIZE Also updated all instances of these macros elsewhere, including subconfigurations, source code, and documentation. Thanks to Devin Matthews for suggesting this change. --- config/a64fx/bli_family_a64fx.h | 4 ++-- config/armsve/bli_family_armsve.h | 4 ++-- config/knc/bli_family_knc.h | 4 ++-- config/knl/bli_family_knl.h | 4 ++-- config/skx/bli_family_skx.h | 4 ++-- docs/ConfigurationHowTo.md | 23 +++++++++++++---------- docs/Testsuite.md | 2 +- frame/base/bli_info.c | 4 ++-- frame/include/bli_kernel_macro_defs.h | 14 +++++++------- 9 files changed, 33 insertions(+), 30 deletions(-) diff --git a/config/a64fx/bli_family_a64fx.h b/config/a64fx/bli_family_a64fx.h index b67ae7c60..f2837459d 100644 --- a/config/a64fx/bli_family_a64fx.h +++ b/config/a64fx/bli_family_a64fx.h @@ -38,8 +38,8 @@ // -- MEMORY ALLOCATION -------------------------------------------------------- -#define BLIS_SIMD_ALIGN_SIZE 256 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_ALIGN_SIZE 256 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. #define N_L1_SVE_DEFAULT 64 diff --git a/config/armsve/bli_family_armsve.h b/config/armsve/bli_family_armsve.h index b67ae7c60..f2837459d 100644 --- a/config/armsve/bli_family_armsve.h +++ b/config/armsve/bli_family_armsve.h @@ -38,8 +38,8 @@ // -- MEMORY ALLOCATION -------------------------------------------------------- -#define BLIS_SIMD_ALIGN_SIZE 256 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_ALIGN_SIZE 256 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. 
#define N_L1_SVE_DEFAULT 64 diff --git a/config/knc/bli_family_knc.h b/config/knc/bli_family_knc.h index 6f9e03e8f..b968b0c9a 100644 --- a/config/knc/bli_family_knc.h +++ b/config/knc/bli_family_knc.h @@ -46,8 +46,8 @@ #define BLIS_SIMD_ALIGN_SIZE 64 -#define BLIS_SIMD_SIZE 64 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_MAX_SIZE 64 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 #if 0 diff --git a/config/knl/bli_family_knl.h b/config/knl/bli_family_knl.h index 64994cd9d..98d3fe8d7 100644 --- a/config/knl/bli_family_knl.h +++ b/config/knl/bli_family_knl.h @@ -52,8 +52,8 @@ #define BLIS_SIMD_ALIGN_SIZE 64 -#define BLIS_SIMD_SIZE 64 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_MAX_SIZE 64 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 /* #ifdef BLIS_NO_HBWMALLOC diff --git a/config/skx/bli_family_skx.h b/config/skx/bli_family_skx.h index ac9478f8b..d698f12b4 100644 --- a/config/skx/bli_family_skx.h +++ b/config/skx/bli_family_skx.h @@ -47,8 +47,8 @@ #define BLIS_SIMD_ALIGN_SIZE 64 -#define BLIS_SIMD_SIZE 64 -#define BLIS_SIMD_NUM_REGISTERS 32 +#define BLIS_SIMD_MAX_SIZE 64 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md index 08eaf8027..dcec7754c 100644 --- a/docs/ConfigurationHowTo.md +++ b/docs/ConfigurationHowTo.md @@ -212,32 +212,35 @@ Furthermore, if a header file needs to be included, such as `my_malloc.h`, it sh _**SIMD register file.**_ BLIS allows you to specify the _maximum_ number of SIMD registers available for use by your kernels, as well as the _maximum_ size (in bytes) of those registers. 
These values default to: ```c -#define BLIS_SIMD_NUM_REGISTERS 32 -#define BLIS_SIMD_SIZE 64 +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 +#define BLIS_SIMD_MAX_SIZE 64 ``` These macros are used in computing the maximum amount of temporary storage (typically allocated statically, on the function stack) that will be needed to hold a single micro-tile of any datatype (and for any induced method): ```c -#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_NUM_REGISTERS * BLIS_SIMD_SIZE * 2 ) +#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2 ) ``` -These temporary buffers are used when handling edge cases (m % _MR_ != 0 || n % _NR_ != 0) within the level-3 macrokernels, and also in the virtual microkernels of various implementations of induced methods for complex matrix multiplication. It is **very important** that these values be set correctly; otherwise, you may experience undefined behavior as stack data is overwritten at run-time. A kernel developer may set `BLIS_SIMD_NUM_REGISTERS` and `BLIS_SIMD_SIZE`, which will indirectly affect `BLIS_STACK_BUF_MAX_SIZE`, or he may set `BLIS_STACK_BUF_MAX_SIZE` directly. Notice that the default values are already set to work with modern x86_64 systems. +These temporary buffers are used when handling edge cases (m % _MR_ != 0 || n % _NR_ != 0) within the level-3 macrokernels, and also in the virtual microkernels of various implementations of induced methods for complex matrix multiplication. It is **very important** that these values be set correctly; otherwise, you may experience undefined behavior as stack data is overwritten at run-time. A kernel developer may set `BLIS_SIMD_MAX_NUM_REGISTERS` and `BLIS_SIMD_MAX_SIZE`, which will indirectly affect `BLIS_STACK_BUF_MAX_SIZE`, or he may set `BLIS_STACK_BUF_MAX_SIZE` directly. Notice that the default values are already set to work with modern x86_64 systems. 
_**Memory alignment.**_ BLIS implements memory alignment internally, rather than relying on a function such as `posix_memalign()`, and thus it can provide aligned memory even with functions that adhere to the `malloc()` and `free()` API in the standard C library. ```c -#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_SIZE +#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE #define BLIS_PAGE_SIZE 4096 #define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE -#define BLIS_POOL_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE +#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE +#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE +#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE +#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE ``` -The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_SIZE`. +The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`. The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. The value `BLIS_HEAP_STRIDE_ALIGN_SIZE` defines the alignment used for so-called "leading dimensions" (i.e. 
column strides for column-stored matrices, and row strides for row-stored matrices) when creating BLIS matrices via the object-based API (e.g. `bli_obj_create()`). While setting `BLIS_HEAP_ADDR_ALIGN_SIZE` guarantees alignment for the first column (or row), creating a matrix with certain dimension values (_m_ and _n_) may cause subsequent columns (or rows) to be misaligned. Setting this value to `BLIS_SIMD_ALIGN_SIZE` is usually desirable. Additional alignment may or may not be beneficial. -The value `BLIS_POOL_ADDR_ALIGN_SIZE` defines the alignment used when allocating blocks to the memory pools used to manage internal packing buffers. Any block of memory returned by the memory allocator is guaranteed to be aligned to this value. Aligning these blocks to the virtual memory page size (usually 4096 bytes) is standard practice. +The values `BLIS_POOL_ADDR_ALIGN_SIZE_*` define the alignments used when allocating blocks to the memory pools used to manage internal packing buffers for matrices A, B, C, and for general use. Any block of memory returned by the memory allocator is guaranteed to be aligned to the corresponding value. Aligning these blocks to the virtual memory page size (usually 4096 bytes) is standard practice. @@ -635,8 +638,8 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f ``` and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. 
```c - #define BLIS_SIMD_NUM_REGISTERS 32 - #define BLIS_SIMD_SIZE 64 + #define BLIS_SIMD_MAX_NUM_REGISTERS 32 + #define BLIS_SIMD_MAX_SIZE 64 #ifdef BLIS_NO_HBWMALLOC #include diff --git a/docs/Testsuite.md b/docs/Testsuite.md index d34955f0a..7c4893d04 100644 --- a/docs/Testsuite.md +++ b/docs/Testsuite.md @@ -150,7 +150,7 @@ _**Vector storage scheme.**_ Similar to the matrix storage scheme string, this s _**Test all combinations of storage schemes?**_ Enabling this option causes all combinations of storage schemes to be tested. For example, if the option is disabled, a matrix storage scheme string of `cr` would cause the `gemm` test module to test execution where all matrix operands are column-stored, and then where all matrix operands are row-stored. Enabling this option with the same matrix storage string (`cr`) would cause the test suite to test `gemm` under all eight scenarios where the three `gemm` matrix operands are either column-stored or row-stored. -_**Perform all tests with alignment?**_ Disabling this option causes the leading dimension (row or column stride) of test matrices to **not** be aligned according to `BLIS_HEAP_STRIDE_ALIGN_SIZE`, which defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_SIZE`, which defaults to 64 (bytes). (If any of these values is set to a non-default value, it would be in `bli_family_.h` where `` is the configuration family.) Sometimes it's useful to disable leading dimension alignment in order to test certain aspects of BLIS that need to handle computing with unaligned user data, such as level-1v and level-1f kernels. +_**Perform all tests with alignment?**_ Disabling this option causes the leading dimension (row or column stride) of test matrices to **not** be aligned according to `BLIS_HEAP_STRIDE_ALIGN_SIZE`, which defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`, which defaults to 64 (bytes). 
(If any of these values is set to a non-default value, it would be in `bli_family_.h` where `` is the configuration family.) Sometimes it's useful to disable leading dimension alignment in order to test certain aspects of BLIS that need to handle computing with unaligned user data, such as level-1v and level-1f kernels. _**Randomize vectors and matrices.**_ The default randomization method uses real values on the interval [-1,1]. However, we offer an alternate randomization using powers of two in a narrow precision range, which is more likely to result in test residuals exactly equal to zero. This method is somewhat niche/experimental and most people should use random values on the [-1,1] interval. diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 8a3dcd30a..bfa5ca9a3 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -54,8 +54,8 @@ gint_t bli_info_get_int_type_size( void ) { return BLIS_INT_TYPE_SIZ gint_t bli_info_get_num_fp_types( void ) { return BLIS_NUM_FP_TYPES; } gint_t bli_info_get_max_type_size( void ) { return BLIS_MAX_TYPE_SIZE; } gint_t bli_info_get_page_size( void ) { return BLIS_PAGE_SIZE; } -gint_t bli_info_get_simd_num_registers( void ) { return BLIS_SIMD_NUM_REGISTERS; } -gint_t bli_info_get_simd_size( void ) { return BLIS_SIMD_SIZE; } +gint_t bli_info_get_simd_num_registers( void ) { return BLIS_SIMD_MAX_NUM_REGISTERS; } +gint_t bli_info_get_simd_size( void ) { return BLIS_SIMD_MAX_SIZE; } gint_t bli_info_get_simd_align_size( void ) { return BLIS_SIMD_ALIGN_SIZE; } gint_t bli_info_get_stack_buf_max_size( void ) { return BLIS_STACK_BUF_MAX_SIZE; } gint_t bli_info_get_stack_buf_align_size( void ) { return BLIS_STACK_BUF_ALIGN_SIZE; } diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index d2487584e..4de624f98 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -163,21 +163,21 @@ // When configuring with umbrella configuration 
families, this should be // set to the maximum number of registers across all sub-configurations in // the family. -#ifndef BLIS_SIMD_NUM_REGISTERS -#define BLIS_SIMD_NUM_REGISTERS 32 +#ifndef BLIS_SIMD_MAX_NUM_REGISTERS +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 #endif // The maximum size (in bytes) of each SIMD vector. // When configuring with umbrella configuration families, this should be // set to the maximum SIMD size across all sub-configurations in the family. -#ifndef BLIS_SIMD_SIZE -#define BLIS_SIMD_SIZE 64 +#ifndef BLIS_SIMD_MAX_SIZE +#define BLIS_SIMD_MAX_SIZE 64 #endif // Alignment size (in bytes) needed by the instruction set for aligned // SIMD/vector instructions. #ifndef BLIS_SIMD_ALIGN_SIZE -#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_SIZE +#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE #endif // The maximum size in bytes of local stack buffers within macro-kernel @@ -188,8 +188,8 @@ // micro-tile footprint, even though the virtual micro-kernels will only // ever be writing to half (real or imaginary part) at a time. #ifndef BLIS_STACK_BUF_MAX_SIZE -#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_NUM_REGISTERS * \ - BLIS_SIMD_SIZE * 2 ) +#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \ + BLIS_SIMD_MAX_SIZE * 2 ) #endif // Alignment size used to align local stack buffers within macro-kernel From 4d8352309784403ed6719528968531ffb4483947 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Wed, 23 Feb 2022 01:03:47 +0900 Subject: [PATCH 033/230] Add armsve to arm64 Metaconfig (#614) Availability of the `armsve` subconfig is controlled by the compiler version (gcc/clang). Tested for SVE and non-SVE. Fixes #612. 
--- config/arm64/bli_family_arm64.h | 13 ++++++++++++- config/armsve/bli_cntx_init_armsve.c | 8 ++++++++ config_registry | 2 +- kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c | 2 +- .../armsve/1m/old/bli_dpackm_armsve512_int_12xk.c | 2 +- kernels/armsve/bli_kernels_armsve.h | 2 +- 6 files changed, 24 insertions(+), 5 deletions(-) diff --git a/config/arm64/bli_family_arm64.h b/config/arm64/bli_family_arm64.h index 278c22818..b242d7049 100644 --- a/config/arm64/bli_family_arm64.h +++ b/config/arm64/bli_family_arm64.h @@ -39,7 +39,18 @@ // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 - +#define BLIS_SIMD_MAX_NUM_REGISTERS 32 + +// SVE-specific configs. +#define N_L1_SVE_DEFAULT 64 +#define W_L1_SVE_DEFAULT 4 +#define C_L1_SVE_DEFAULT 256 +#define N_L2_SVE_DEFAULT 2048 +#define W_L2_SVE_DEFAULT 16 +#define C_L2_SVE_DEFAULT 256 +#define N_L3_SVE_DEFAULT 8192 +#define W_L3_SVE_DEFAULT 16 +#define C_L3_SVE_DEFAULT 256 //#endif diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index cd07924a7..ad0e68219 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -33,9 +33,17 @@ */ #include "blis.h" +#include <sys/auxv.h> + +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif void bli_cntx_init_armsve( cntx_t* cntx ) { + if (!(getauxval( AT_HWCAP ) & HWCAP_SVE)) + return; + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; #if 0 blksz_t thresh[ BLIS_NUM_THRESH ]; diff --git a/config_registry b/config_registry index d472325c7..93cff1523 100644 --- a/config_registry +++ b/config_registry @@ -12,7 +12,7 @@ x86_64: intel64 amd64 amd64_legacy intel64: skx knl haswell sandybridge penryn generic amd64_legacy: excavator steamroller piledriver bulldozer generic amd64: zen3 zen2 zen generic -arm64: firestorm thunderx2 cortexa57 cortexa53 generic +arm64: armsve firestorm thunderx2 cortexa57 cortexa53 generic arm32: cortexa15 cortexa9 generic # Intel architectures. 
diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c index 85dfaa9c0..7171347bf 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c @@ -35,7 +35,7 @@ #include "blis.h" -#if (defined(BLIS_FAMILY_ARMSVE) && !defined(BLIS_FAMILY_A64FX)) +#if !defined(BLIS_FAMILY_A64FX) #include // assumption: diff --git a/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c b/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c index 966b0c134..47b15b437 100644 --- a/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c +++ b/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c @@ -36,7 +36,7 @@ #include "blis.h" #include -#if (defined(BLIS_FAMILY_ARMSVE) && !defined(BLIS_FAMILY_A64FX)) +#if !defined(BLIS_FAMILY_A64FX) #include // assumption: diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index 39daf30c6..00e1f0455 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -45,7 +45,7 @@ GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed ) // Use SVE intrinsics only for referred cases. -#if (defined(BLIS_FAMILY_ARMSVE) && !defined(BLIS_FAMILY_A64FX)) +#if !defined(BLIS_FAMILY_A64FX) PACKM_KER_PROT( double, d, packm_armsve256_int_8xk ) PACKM_KER_PROT( double, d, packm_armsve512_int_12xk ) #endif From d5146582b1f1bcdccefe23925d3b114d40cd7e31 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Wed, 23 Feb 2022 03:35:46 +0900 Subject: [PATCH 034/230] ArmSVE Ensure Non-zero Block Size (#615) Fixes #613. There are several macros/environment variables which need to be tuned to get good cache block sizes. It would be nice to have a way of getting values automatically. 
--- kernels/armsve/3/bli_armsve_utils.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernels/armsve/3/bli_armsve_utils.c b/kernels/armsve/3/bli_armsve_utils.c index 1e3256d34..2ebafa655 100644 --- a/kernels/armsve/3/bli_armsve_utils.c +++ b/kernels/armsve/3/bli_armsve_utils.c @@ -79,6 +79,11 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \ dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \ dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \ n_c -= n_c % n_r; \ +\ + /* Ensure non-zero block sizes. */ \ + m_c = bli_max(m_c, m_r); \ + n_c = bli_max(n_c, n_r); \ + k_c = bli_max(k_c, 128); \ \ *m_r_ = m_r; \ *n_r_ = n_r; \ From 84732bf95634ac606c5f2661d9474318e366c386 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 28 Feb 2022 12:19:31 -0600 Subject: [PATCH 035/230] Revamp how tools are handled/checked by configure. Details: - Consolidate handling of tools that are specifiable via CC, CXX, FC, PYTHON, AR, and RANLIB into one bash function, select_tool_w_env(). - If the user specifies a tool via an environment variable (e.g. CC=gcc) and that tool does not seem valid, print an error message and abort configure, unless the tool is optional (e.g. CXX or FC), in which case a warning message is printed instead. - The definition of "seems valid" above amounts to: - responding to at least one of a basic set of command line options (e.g. --version, -V, -h) if the os_name is Linux (since GNU tools tend to respond to flags such as --version) or if the tool in question is CC, CXX, FC, or PYTHON (which tend to respond to the expected flags regardless of OS) - the binary merely existing for AR and RANLIB on Darwin/OSX/BSD. (These OSes tend to have non-GNU versions of ar and ranlib, which typically do not respond to --version and friends.) - This PR addresses #584. Thanks to Devin Matthews for suggesting some of the changes in this commit. 
--- configure | 440 ++++++++++++++++++++++++++++++----------- frame/compat/bla_dot.c | 5 +- 2 files changed, 333 insertions(+), 112 deletions(-) diff --git a/configure b/configure index c03df26cd..5f3e83eaa 100755 --- a/configure +++ b/configure @@ -360,15 +360,18 @@ print_usage() echo " CC Specifies the C compiler to use." echo " CXX Specifies the C++ compiler to use (sandbox only)." echo " FC Specifies the Fortran compiler to use (only to determine --complex-return)." - echo " RANLIB Specifies the ranlib executable to use." - echo " AR Specifies the archiver to use." + echo " AR Specifies the static library archiver to use." + echo " RANLIB Specifies the ranlib (library indexer) executable to use." + echo " PYTHON Specifies the python interpreter to use." echo " CFLAGS Specifies additional compiler flags to use (prepended)." echo " LDFLAGS Specifies additional linker flags to use (prepended)." echo " LIBPTHREAD Pthreads library to use." - echo " PYTHON Specifies the python interpreter to use." echo " " - echo " Environment variables may also be specified as command line" - echo " options, e.g.:" + echo " Environment variables are traditionally set prior to running configure:" + echo " " + echo " CC=gcc ./configure [options] haswell" + echo " " + echo " However, they may also be specified as command line options, e.g.:" echo " " echo " ./configure [options] CC=gcc haswell" echo " " @@ -418,10 +421,10 @@ assign_key_value() # # found in a blacklist. # # # Note: $2 can actually be a list of items. -# dlist=\$"$1" -# ditem=\$"$2" +# ditem=\$"$1" +# dlist=\$"$2" # -# # Acquire the contents of $list and $item and store them in list_c +# # Acquire the contents of $dlist and $ditem and store them in list_c # # and item_c, respectively. # list_c=$(eval "expr \"$dlist\" ") # item_c=$(eval "expr \"$ditem\" ") @@ -438,7 +441,7 @@ assign_key_value() # done # # # Update the argument. 
-# eval "$1=\"${list_c}\"" +# eval "$2=\"${list_c}\"" #} pass_config_kernel_registries() @@ -1049,42 +1052,31 @@ get_cxx_search_list() echo "${list}" } -select_tool() +get_fc_search_list() { - local search_list CC_env the_cc cc + local list - # This is the list of compilers/tools to search for, and the order in - # which to search for them. - search_list=$1 + list="gfortran ifort" - # The environment variable associated with the compiler/tool type we - # are searching (e.g. CC, CXX, PYTHON). - CC_env=$2 + echo "${list}" +} - # If CC_env contains something, add it to the beginning of our default - # search list. - if [ -n "${CC_env}" ]; then - search_list="${CC_env} ${search_list}" - fi +get_ar_search_list() +{ + local list - # Initialize our selected compiler/tool to empty. - the_cc="" + list="ar" - # Try each compiler/tool in the list and select the first one we find that - # works. - for cc in ${search_list}; do + echo "${list}" +} - # See if the current compiler/tool works and/or is present. - ${cc} --version > /dev/null 2>&1 +get_ranlib_search_list() +{ + local list - if [ "$?" == 0 ]; then - the_cc=${cc} - break - fi - done + list="ranlib" - # Return the selected compiler/tool. - echo "${the_cc}" + echo "${list}" } auto_detect() @@ -2054,6 +2046,223 @@ set_default_version() fi } +select_tool_w_env() +{ + local search_list env_var env_str tool_str found_var + local _the_tool + + # Example calling sequence: + # + # select_tool_w_env "${cc_search_list}" "${CC}" "CC" "C compiler" "yes" found_cc + # + + search_list="$1" # the tool's default search list. + env_var="$2" # the value of the environment variable for this tool. + env_str="$3" # a string naming the source of env_var. + tool_str="$4" # a human-readable string identifying the tool. + is_required="$5" # is it fatal if env_var doesn't exist/work? (yes or no) + found_var="$6" # the variable into which to save the selected tool. + + # If the environment variable contains something, verify that it exists. 
If + # it is unset or empty, we proceed with the default search list. + if [ -n "${env_var}" ]; then + + echo "${script_name}: user specified a ${tool_str} via ${env_str} (${env_var})." + + # See if the binary specified by env_var exists. + _the_tool=$(select_tool "${env_var}" "${env_str}") + + # Copy the result into the variable specified by found_var. + eval "${found_var}=\"${_the_tool}\"" + + # If the tool specified by env_var doesn't exist, throw a tantrum. + if [ -z "${_the_tool}" ]; then + + echo "${script_name}: *** Could not find the ${tool_str} specified via ${env_str} ('${env_var}')." + + # Whether the tantrum is fatal depends on the is_required argument. + if [ "${is_required}" == "yes" ]; then + echo "${script_name}: *** A working ${tool_str} is required. Please set ${env_str}" + echo "${script_name}: *** to a ${tool_str} that exists (or unset ${env_str})." + exit 1 + else + echo "${script_name}: *** Note that a ${tool_str} will not be available." + + # Set the found_var variable to *something* so that the output + # makefile fragment contains a record that the tool wasn't found. + eval "${found_var}=\"${env_str}\"-not-found" + fi + else + # The user-specified tool was found. + echo "${script_name}: ${_the_tool} exists and appears to work." + echo "${script_name}: using '${_the_tool}' as ${tool_str}." + fi + + else + + echo "${script_name}: ${tool_str} search list is: ${search_list}." + + # Search for a working tool from the search list. + _the_tool=$(select_tool "${search_list}" "${env_str}") + + # Copy the result into the variable specified by found_var. + eval "${found_var}=\"${_the_tool}\"" + + # If we didn't find a working tool from the search list, throw a tantrum. + if [ -z "${_the_tool}" ]; then + + echo "${script_name}: *** Could not find a ${tool_str} from the search list." + + # Whether the tantrum is fatal depends on the is_required argument. 
+ if [ "${is_required}" == "yes" ]; then + echo "${script_name}: *** A working ${tool_str} is required. Cannot continue." + exit 1 + else + echo "${script_name}: *** Note that a ${tool_str} will not be available." + + # Set the found_var variable to *something* so that the output + # makefile fragment contains a record that the tool wasn't found. + eval "${found_var}=\"${env_str}-not-found\"" + fi + else + # A tool from the search list was found. + echo "${script_name}: found '${_the_tool}'." + echo "${script_name}: using '${_the_tool}' as ${tool_str}." + fi + fi +} + +select_tool() +{ + local search_list env_str + local the_tool tool the_flags rval + + # This is the list of tools to search for, and the order in which + # to search for them. + search_list="$1" + + # This is the name of the environment variable associated with the tool. For + # example, if search_list is a list of C compilers, env_str will be "CC". + env_str="$2" + + # Initialize our selected tool to empty. + the_tool="" + + # Try each tool in the list and select the first one we find that works. + for tool in ${search_list}; do + + # Map each tool (via its canonical environment variable form) to the set + # of options we should use to check that it is working and available. + the_flags=$(get_tool_checkflags "${env_str}") + + # Check that the tool works with at least one of the flags in the_flags + # the_flags (or, if the_flags is empty, check that the tool exists). + rval=$(check_tool "${tool}" "${the_flags}") + + # If check_tool() returns 0, we're done. + if [ "${rval}" == "0" ]; then + the_tool=${tool} + break + fi + done + + # Return the selected tool. + echo "${the_tool}" +} + +get_tool_checkflags() +{ + local env_str + local allflags flaglist + + # The tool for which we will determine the flag/option to pass in + # when testing that the tool works. Notice that it's not actually + # the tool but rather its equivalent environment variable. 
+ env_str="${1}" + + # The default list of flags to use in most circumstances. + allflags="--version -V -h" + + if [ "${os_name}" = "Linux" ]; then + + # If we are on Linux, it is very likely that all the tools will respond + # to at least one of the usual flags. + flaglist="${allflags}" + + else + + # If we are on Darwin/OSX/BSD or something else, we sometimes skip flag + # checks. (Note that when the list of flags to check is empty, we end + # up testing for the existence of the tool instead.) + if [ "${env_str}" = "AR" -o \ + "${env_str}" = "RANLIB" ]; then + + # AR, RANLIB may not respond to the normal flags on Darwin/OSX/BSD, + # so all we can really do is check for their existence. + flaglist="" + else + # Even on Darwin/OSX/BSD, we expect that CC, CXX, FC, PYTHON will + # respond to the typical flag checklist. + flaglist="${allflags}" + fi + fi + + echo "${flaglist}" +} + +check_tool() +{ + local tool the_flags + local rval opt toolpath + + # This is the name, or filepath, of the tool to check for. + tool="$1" + + # Some command line options to try to determine that the tool works. + the_flags="$2" + + # Start with the assumption that the tool doesn't work/exist. + rval=1 + + if [ -n "${the_flags}" ]; then + + # If the list of flags to check is non-empty, we will iterate through the + # list in search of a flag that works. Failure to find one that works + # means the tool doesn't work (or, if the user specified the tool via + # its environment variable, failure might mean that the tool doesn't + # even exist). + + # Try each flag in the list of flags. + for opt in ${the_flags}; do + + # See if the tool responds to the current flag. + ${tool} ${opt} > /dev/null 2>&1 + + # If the tool responded to the flag with a nominal error code of + # 0, we found one that works and set rval accordingly. + if [ "$?" 
== 0 ]; then + rval=0 + break + fi + done + else + + # If the list of flags to check is empty, we interpret this as a + # request to instead check for the existence of the tool. + + # Use 'which' to determine if the tool exists. + toolpath="$(which ${tool} 2> /dev/null)" + + # If the tool doesn't exist, we set rval accordingly. + if [ -n "${toolpath}" ]; then + rval=0 + fi + fi + + # Return the error code. + echo "${rval}" +} + # @@ -2568,24 +2777,13 @@ main() # -- Find a python interpreter --------------------------------------------- - # Acquire the python search order. This may vary based on the os found - # above. + # Acquire the default python search order. python_search_list=$(get_python_search_list) - echo "${script_name}: python interpeter search list is: ${python_search_list}." - - # Find a working python interpreter. - found_python=$(select_tool "${python_search_list}" "${PYTHON}") - - # If we didn't find any working python interpreters, we print an error - # message. - if [ -z "${found_python}" ]; then - echo "${script_name}: *** Could not find working python interperter! Cannot continue." - exit 1 - fi - - echo "${script_name}: using '${found_python}' python interpreter." - + # Select a python interpreter from the default list, or from PYTHON if it + # refers to a valid binary. + select_tool_w_env "${python_search_list}" "${PYTHON}" "PYTHON" \ + "python interpreter" "yes" found_python # -- Check the python version ---------------------------------------------- @@ -2596,22 +2794,13 @@ main() # -- Find a C compiler ----------------------------------------------------- - # Acquire the compiler search order. This will vary based on the os found - # above. + # Acquire the default compiler search order. This will vary based on os_name. cc_search_list=$(get_cc_search_list) - echo "${script_name}: C compiler search list is: ${cc_search_list}." - - # Find a working C compiler. 
- found_cc=$(select_tool "${cc_search_list}" "${CC}") - - # If we didn't find any working C compilers, we print an error message. - if [ -z "${found_cc}" ]; then - echo "${script_name}: *** Could not find working C compiler! Cannot continue." - exit 1 - fi - - echo "${script_name}: using '${found_cc}' C compiler." + # Select a C compiler from the default list, or from CC if it refers to a + # valid binary. + select_tool_w_env "${cc_search_list}" "${CC}" "CC" \ + "C compiler" "yes" found_cc # Also check the compiler to see if we are (cross-)compiling for Windows if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then @@ -2619,27 +2808,6 @@ main() fi - # -- Find a C++ compiler --------------------------------------------------- - - # Acquire the compiler search order. This will vary based on the os - # found above. - cxx_search_list=$(get_cxx_search_list) - - echo "${script_name}: C++ compiler search list is: ${cxx_search_list}." - - # Find a working C++ compiler. NOTE: We can reuse the select_tool() - # function since it is written in a way that is general-purpose. - found_cxx=$(select_tool "${cxx_search_list}" "${CXX}") - - # If we didn't find any working C++ compilers, we print an error message. - if [ -z "${found_cxx}" ]; then - echo "${script_name}: Could not find working C++ compiler! C++ will not be available in sandbox." - found_cxx="c++notfound" - fi - - echo "${script_name}: using '${found_cxx}' C++ compiler (for sandbox only)." - - # -- Check the compiler version -------------------------------------------- # Initialize the blacklist to empty. @@ -2670,6 +2838,57 @@ main() fi + # -- Find a C++ compiler --------------------------------------------------- + + # Acquire the default C++ compiler search order. This will vary based on + # os_name. + cxx_search_list=$(get_cxx_search_list) + + # Select a C compiler from the default list, or from CC if it refers to a + # valid binary. 
+ select_tool_w_env "${cxx_search_list}" "${CXX}" "CXX" \ + "C++ compiler" "no" found_cxx + + + # -- Find a Fortran compiler ----------------------------------------------- + + # Acquire the default Fortran compiler search order. + fc_search_list=$(get_fc_search_list) + + # Select a Fortran compiler from the default list, or from FC if it refers + # to a valid binary. + # NOTE: A Fortran compiler is not necessary for building BLIS. The only + # reason we might want to query it is to detect the style of returning + # complex values from functions. The 'gnu' style returns complex values + # from functions normally, via the C language return statement, while the + # 'intel' style returns them in a "hidden" parameter (inserted by the + # compiler) that precedes all other function parameters. + select_tool_w_env "${fc_search_list}" "${FC}" "FC" \ + "Fortran compiler" "no" found_fc + + + # -- Find a static library archiver ---------------------------------------- + + # Acquire the default archiver search order. + ar_search_list=$(get_ar_search_list) + + # Select an archiver from the default list, or from AR if it refers + # to a valid binary. + select_tool_w_env "${ar_search_list}" "${AR}" "AR" \ + "library archiver" "yes" found_ar + + + # -- Find an archive indexer ----------------------------------------------- + + # Acquire the default archive indexer search order. + ranlib_search_list=$(get_ranlib_search_list) + + # Select an archive indexer from the default list, or from RANLIB if it + # refers to a valid binary. + select_tool_w_env "${ranlib_search_list}" "${RANLIB}" "RANLIB" \ + "archive indexer" "yes" found_ranlib + + # -- Read the configuration registry --------------------------------------- # Make sure the config registry file exists and can be opened. @@ -3399,10 +3618,16 @@ main() enable_sandbox_01=0 fi - # Check the method used for returning complex numbers + # Check the method used for returning complex numbers. 
if [ "x${complex_return}" = "xdefault" ]; then - if [ -n "${FC}" ]; then - # Determine the complex return type from the given Fortran compiler + + # If we previously found a Fortran compiler, let's query it to see what + # kind of complex return type it uses (gnu or intel). The 'gnu' style + # returns complex values from functions normally, via the C language + # return statement, while the 'intel' style returns them in a "hidden" + # parameter (inserted by the compiler) that precedes all other function + # parameters. + if [ -n "${found_fc}" ]; then # Query the full vendor version string output. This includes the # version number along with (potentially) a bunch of other textual @@ -3411,8 +3636,7 @@ main() # stdout. But it works for now. vendor_string="$(${FC} --version 2>/dev/null)" - # Query the compiler "vendor" (ie: the compiler's simple name) and - # isolate the version number. + # Query the compiler "vendor" (ie: the compiler's simple name). # The last part ({ read first rest ; echo $first ; }) is a workaround # to OS X's egrep only returning the first match. fc_vendor=$(echo "${vendor_string}" | egrep -o 'ifort|GNU' | { read first rest ; echo $first ; }) @@ -3445,23 +3669,19 @@ main() # Variables that may contain forward slashes, such as paths, need extra # escaping when used in sed commands. We insert those extra escape # characters here so that the sed commands below do the right thing. 
- os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g') - prefix_esc=$(echo "${prefix}" | sed 's/\//\\\//g') - exec_prefix_esc=$(echo "${exec_prefix}" | sed 's/\//\\\//g') - libdir_esc=$(echo "${libdir}" | sed 's/\//\\\//g') - includedir_esc=$(echo "${includedir}" | sed 's/\//\\\//g') - sharedir_esc=$(echo "${sharedir}" | sed 's/\//\\\//g') - dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g') - cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g') - cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g') - python_esc=$(echo "${found_python}" | sed 's/\//\\\//g') - #sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g') - - # For RANLIB, if the variable is not set, we use a default value of - # 'ranlib'. - ranlib_esc=$(echo "${RANLIB:-ranlib}" | sed 's/\//\\\//g') - # For AR, if the variable is not set, we use a default value of 'ar'. - ar_esc=$(echo "${AR:-ar}" | sed 's/\//\\\//g') + os_name_esc=$(echo "${os_name}" | sed 's/\//\\\//g') + prefix_esc=$(echo "${prefix}" | sed 's/\//\\\//g') + exec_prefix_esc=$(echo "${exec_prefix}" | sed 's/\//\\\//g') + libdir_esc=$(echo "${libdir}" | sed 's/\//\\\//g') + includedir_esc=$(echo "${includedir}" | sed 's/\//\\\//g') + sharedir_esc=$(echo "${sharedir}" | sed 's/\//\\\//g') + dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g') + cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g') + cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g') + ar_esc=$(echo "${found_ar}" | sed 's/\//\\\//g') + ranlib_esc=$(echo "${found_ranlib}" | sed 's/\//\\\//g') + python_esc=$(echo "${found_python}" | sed 's/\//\\\//g') + libpthread_esc=$(echo "${LIBPTHREAD--lpthread}" | sed 's/\//\\\//g') cflags_preset_esc=$(echo "${cflags_preset}" | sed 's/\//\\\//g') ldflags_preset_esc=$(echo "${ldflags_preset}" | sed 's/\//\\\//g') @@ -3577,8 +3797,8 @@ main() | sed -e "s/@aocc_older_than_3_0_0@/${aocc_older_than_3_0_0}/g" \ | sed -e "s/@CC@/${cc_esc}/g" \ | sed -e "s/@CXX@/${cxx_esc}/g" \ - | sed -e "s/@RANLIB@/${ranlib_esc}/g" \ | sed -e 
"s/@AR@/${ar_esc}/g" \ + | sed -e "s/@RANLIB@/${ranlib_esc}/g" \ | sed -e "s/@PYTHON@/${python_esc}/g" \ | sed -e "s/@libpthread@/${libpthread_esc}/g" \ | sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \ diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c index 0699cb22f..f5396b190 100644 --- a/frame/compat/bla_dot.c +++ b/frame/compat/bla_dot.c @@ -92,9 +92,10 @@ INSERT_GENTFUNCDOTR_BLAS( dot, dotv ) INSERT_GENTFUNCDOTC_BLAS( dot, dotv ) -#else +#else // #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL -// For the "intel" complex return type, use a hidden parameter to return the result +// For the "intel" complex return type, use a hidden preceding parameter to +// return the result rather than an actual return value. #undef GENTFUNCDOT #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \ \ From 71851a0549276b17db18a0a0c8ab4f54493bf033 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 8 Mar 2022 17:38:09 -0600 Subject: [PATCH 036/230] Fixed level-3 performance bug in haswell ukernels. Details: - Fixed a performance regression affecting nearly all level-3 operations that use the 'haswell' sgemm and dgemm microkernels. This regression was introduced in 54fa28b, caused by an ill-formed conditional expression in the assembly code that controls whether cache lines of C should be prefetched as rows or as columns. Essentially, the two branches were reversed, causing incomplete prefetching to occur for both row- and column-stored instances of matrix C. Thanks to Devin Matthews for his help finding and fixing this bug. 
--- kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index d0e793867..70ea4ccd7 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -123,7 +123,7 @@ void bli_sgemm_haswell_asm_6x16 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) - cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPREFETCH) // jump to column prefetch case lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; @@ -803,8 +803,8 @@ void bli_dgemm_haswell_asm_6x8 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) - cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. - jz(.SCOLPREFETCH) // jump to column prefetch case + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. + jz(.DCOLPREFETCH) // jump to column prefetch case lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; @@ -815,9 +815,9 @@ void bli_dgemm_haswell_asm_6x8 prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c - jmp(.SPREFETCHDONE) + jmp(.DPREFETCHDONE) - label(.SCOLPREFETCH) + label(.DCOLPREFETCH) lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; @@ -830,7 +830,7 @@ void bli_dgemm_haswell_asm_6x8 prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c - label(.SPREFETCHDONE) + label(.DPREFETCHDONE) mov(var(k_iter), rsi) // i = k_iter; From cad10410b2305bc0e328c5f2517ab02593b53428 Mon Sep 17 00:00:00 2001 From: Ivan Korostelev Date: Thu, 10 Mar 2022 09:58:14 -0600 Subject: [PATCH 037/230] POWER10: edge cases in microkernel (#620) Use new API for POWER10 gemm microkernel --- sandbox/power10/gemm_template.h | 24 
+++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/sandbox/power10/gemm_template.h b/sandbox/power10/gemm_template.h index 6f9b23032..eb0ef24bb 100644 --- a/sandbox/power10/gemm_template.h +++ b/sandbox/power10/gemm_template.h @@ -80,9 +80,6 @@ void GEMM_FUNC_NAME(ch) \ DTYPE_OUT* c, inc_t rsc, inc_t csc \ ) \ { \ - DTYPE_OUT zero = 0.0; \ - DTYPE_OUT beta_ = *beta; \ - \ DTYPE_IN * restrict btilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, B_ALIGN + KC * NC * sizeof( DTYPE_IN ) ); \ DTYPE_IN * restrict atilde_sys = ( DTYPE_IN *) aligned_alloc( P10_PG_SIZE, A_ALIGN + MC * KC * sizeof( DTYPE_IN ) ); \ \ @@ -104,10 +101,6 @@ void GEMM_FUNC_NAME(ch) \ DTYPE_OUT * restrict cblock = c; \ DTYPE_IN * restrict bblock = b; \ \ - DTYPE_OUT tmp_cmicrotile[MR*NR]; \ - int rsct = ( rsc == 1 ? 1 : NR ); \ - int csct = ( rsc == 1 ? MR : 1 ); \ - \ for ( int jc=0; jc Date: Fri, 11 Mar 2022 13:28:50 -0600 Subject: [PATCH 038/230] Avoid gemmsup barriers when not packing A or B. (#622) Details: - Implemented a multithreaded optimization for the special (and common) case of employing the gemmsup code path when the user requests (implicitly or explicitly) that neither A nor B be packed during computation. This optimization takes the form of a greatly reduced code branch in bli_thrinfo_sup_create_for_cntl(), which avoids a broadcast and two barriers, and results in higher performance when obtaining two-way or higher parallelism within BLIS. Thanks to Bhaskar Nallani of AMD for proposing this change via issue #605. - Added an early return branch to bli_thrinfo_create_for_cntl() that detects and quickly handles cases where no parallelism is being obtained within BLIS (i.e., single-threaded execution). Note that this special case handling was/is already present in bli_thrinfo_sup_create_for_cntl(). - CREDITS file update. 
--- CREDITS | 1 + frame/thread/bli_thrinfo.c | 18 ++++ frame/thread/bli_thrinfo_sup.c | 181 ++++++++++++++++++++------------- 3 files changed, 131 insertions(+), 69 deletions(-) diff --git a/CREDITS b/CREDITS index 7dd452daa..85ed97c6a 100644 --- a/CREDITS +++ b/CREDITS @@ -64,6 +64,7 @@ but many others have contributed code and feedback, including Simon Lukas Märtens @ACSimon33 (RWTH Aachen University) Devin Matthews @devinamatthews (The University of Texas at Austin) Stefanos Mavros @smavros + Mithun Mohan @MithunMohanKadavil (AMD) Ilknur Mustafazade @Runkli @nagsingh Bhaskar Nallani @BhaskarNallani (AMD) diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index f9cd5ce74..0282be170 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -298,6 +298,24 @@ thrinfo_t* bli_thrinfo_create_for_cntl thrinfo_t* thread_par ) { + // If we are running with a single thread, all of the code can be reduced + // and simplified to this. + if ( bli_rntm_calc_num_threads( rntm ) == 1 ) + { + thrinfo_t* thread_chl = bli_thrinfo_create + ( + rntm, // rntm + &BLIS_SINGLE_COMM, // ocomm + 0, // ocomm_id + 1, // n_way + 0, // work_id + FALSE, // free_comm + BLIS_NO_PART, // bszid + NULL // sub_node + ); + return thread_chl; + } + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; thrcomm_t** new_comms = NULL; diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index ab28b7160..984820f39 100644 --- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -145,7 +145,6 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl thrinfo_t* thread_par ) { -#if 1 // If we are running with a single thread, all of the code can be reduced // and simplified to this. 
if ( bli_rntm_calc_num_threads( rntm ) == 1 ) @@ -163,84 +162,128 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl ); return thread_chl; } -#endif - thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; - thrcomm_t** new_comms = NULL; + // The remainder of this function handles the cases involving the use of + // multiple BLIS threads. - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - const dim_t parent_work_id = bli_thread_work_id( thread_par ); - - // Sanity check: make sure the number of threads in the parent's - // communicator is divisible by the number of new sub-groups. - if ( parent_nt_in % parent_n_way != 0 ) + if ( bli_rntm_pack_a( rntm ) == FALSE && + bli_rntm_pack_b( rntm ) == FALSE ) { - printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); - bli_abort(); + // If we are packing neither A nor B, there are no broadcasts or barriers + // needed to synchronize threads (since all threads can work completely + // independently). In this special case situation, the thrinfo_t can be + // created with much simpler logic. + + const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); + + // Compute: + // - the number of threads inside the new child comm, + // - the current thread's id within the new communicator, + // - the current thread's work id, given the ways of parallelism + // to be obtained within the next loop. + const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); + const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); + const dim_t child_comm_id = parent_comm_id % child_nt_in; + const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); + + // All threads create a new thrinfo_t node using the communicator + // that was created by their chief, as identified by parent_work_id. 
+ thrinfo_t* thread_chl = bli_thrinfo_create + ( + rntm, // rntm + NULL, // ocomm + child_comm_id, // ocomm_id + child_n_way, // n_way + child_work_id, // work_id + TRUE, // free_comm + *bszid_chl, // bszid + NULL // sub_node + ); + + return thread_chl; } + else + { + // If we are packing at least one of A or B, then we use the general + // approach that employs broadcasts and barriers. + + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; + thrcomm_t** new_comms = NULL; - // Compute: - // - the number of threads inside the new child comm, - // - the current thread's id within the new communicator, - // - the current thread's work id, given the ways of parallelism - // to be obtained within the next loop. - const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); - const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); + const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); + const dim_t parent_n_way = bli_thread_n_way( thread_par ); + const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); + const dim_t parent_work_id = bli_thread_work_id( thread_par ); + + // Sanity check: make sure the number of threads in the parent's + // communicator is divisible by the number of new sub-groups. + if ( parent_nt_in % parent_n_way != 0 ) + { + printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); + bli_abort(); + } + + // Compute: + // - the number of threads inside the new child comm, + // - the current thread's id within the new communicator, + // - the current thread's work id, given the ways of parallelism + // to be obtained within the next loop. 
+ const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); + const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); + const dim_t child_comm_id = parent_comm_id % child_nt_in; + const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); //printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); - // The parent's chief thread creates a temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - err_t r_val; + // The parent's chief thread creates a temporary array of thrcomm_t + // pointers. + if ( bli_thread_am_ochief( thread_par ) ) + { + err_t r_val; - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); - else - new_comms = static_comms; - } + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); + else + new_comms = static_comms; + } - // Broadcast the temporary array to all threads in the parent's - // communicator. - new_comms = bli_thread_broadcast( thread_par, new_comms ); - - // Chiefs in the child communicator allocate the communicator - // object and store it in the array element corresponding to the - // parent's work id. - if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - - bli_thread_barrier( thread_par ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. 
+ thrinfo_t* thread_chl = bli_thrinfo_create + ( + rntm, // rntm + new_comms[ parent_work_id ], // ocomm + child_comm_id, // ocomm_id + child_n_way, // n_way + child_work_id, // work_id + TRUE, // free_comm + *bszid_chl, // bszid + NULL // sub_node + ); - return thread_chl; + bli_thread_barrier( thread_par ); + + // The parent's chief thread frees the temporary array of thrcomm_t + // pointers. + if ( bli_thread_am_ochief( thread_par ) ) + { + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + bli_free_intl( new_comms ); + } + + return thread_chl; + } } From f1dbb0e514f53a3240d3a6cbdc3306b01a2206f5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 11 Mar 2022 13:38:28 -0600 Subject: [PATCH 039/230] Trivial whitespace change; commit log addendum. Details: - A co-attribution to Mithun Mohan was inadvertently omitted from the commit log for the headline change in the previous commit, 7c07b47. 
--- frame/thread/bli_thrinfo_sup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index 984820f39..881990f78 100644 --- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -160,6 +160,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl BLIS_NO_PART, // bszid NULL // sub_node ); + return thread_chl; } From d6810000e961fe807dc5a7db81180a8355f3eac0 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 14 Mar 2022 10:29:54 -0500 Subject: [PATCH 040/230] Update Multithreading.md Add notes about `BLIS_IR_NT` (should typically be 1) and `BLIS_JR_NT` (should typically be small, e.g. <= 4). [ci skip] --- docs/Multithreading.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/Multithreading.md b/docs/Multithreading.md index b50db5b70..48fbc8ca1 100644 --- a/docs/Multithreading.md +++ b/docs/Multithreading.md @@ -142,13 +142,13 @@ The manual way of specifying parallelism involves communicating which loops with The below chart describes the five loops used in BLIS's matrix multiplication operations. -| Loop around microkernel | Environment variable | Direction | Notes | -|:-------------------------|:---------------------|:----------|:------------| -| 5th loop | `BLIS_JC_NT` | `n` | | -| 4th loop | _N/A_ | `k` | Not enabled | -| 3rd loop | `BLIS_IC_NT` | `m` | | -| 2nd loop | `BLIS_JR_NT` | `n` | | -| 1st loop | `BLIS_IR_NT` | `m` | | +| Loop around microkernel | Environment variable | Direction | Notes | +|:-------------------------|:---------------------|:----------|:---------------| +| 5th loop | `BLIS_JC_NT` | `n` | | +| 4th loop | _N/A_ | `k` | Not enabled | +| 3rd loop | `BLIS_IC_NT` | `m` | | +| 2nd loop | `BLIS_JR_NT` | `n` | Typically <= 4 | +| 1st loop | `BLIS_IR_NT` | `m` | Typically 1 | **Note**: Parallelization of the 4th loop is not currently enabled because each iteration of the loop updates the same part of the output matrix C. 
Thus, to safely parallelize it requires either a reduction or mutex locks when updating C. @@ -161,7 +161,7 @@ In general, the way to choose how to set these environment variables is as follo Next, which combinations of loops to parallelize depends on which caches are shared. Here are some of the more common scenarios: * When compute resources have private L3 caches (example: multi-socket systems), try parallelizing the `JC` loop. This means threads (or thread groups) will pack and compute with different row panels from matrix B. * For compute resources that have private L2 caches but that share an L3 cache (example: cores on a socket), try parallelizing the `IC` loop. In this situation, threads will share the same packed row panel from matrix B, but pack and compute with different blocks of matrix A. - * If compute resources share an L2 cache but have private L1 caches (example: pairs of cores), try parallelizing the `JR` loop. Here, threads share the same packed block of matrix A but read different packed micropanels of B into their private L1 caches. In some situations, parallelizing the `IR` loop may also be effective. + * If compute resources share an L2 cache but have private L1 caches (example: pairs of cores), try parallelizing the `JR` loop. Here, threads share the same packed block of matrix A but read different packed micropanels of B into their private L1 caches. In some situations, *lightly* parallelizing the `IR` loop may also be effective. ![The primary algorithm for level-3 operations in BLIS](http://www.cs.utexas.edu/users/field/mm_algorithm_color.png) From 0db2bd5341c5c3ed5f1cc2bffa90952735efa45f Mon Sep 17 00:00:00 2001 From: Bhaskar Nallani Date: Fri, 25 Mar 2022 05:11:55 +0530 Subject: [PATCH 041/230] Added BLAS/CBLAS APIs for gemm3m. (#590) Details: - Created ?gemm3m_() and cblas_?gemm3m() APIs that (for now) simply invoke the 1m implementation unconditionally. (Note that these APIs bypass sup handling.) 
- Added BLAS prototypes for gemm3m in frame/compat/extra/bla_gemm3m.h. - Added CBLAS prototypes for gemm3m in frame/compat/cblas/src/cblas.h. - Relocated: frame/compat/cblas/src/cblas_?gemmt.c files into frame/compat/cblas/src/extra/ - Relocated frame/compat/bla_gemmt.? into frame/compat/extra/ . - Minor reorganization of prototypes and cpp macro directives in bli_blas.h, cblas.h, and cblas_f77.h. - Trivial whitespace change to cblas_zgemm.c. --- frame/compat/bli_blas.h | 20 +- frame/compat/cblas/src/cblas.h | 56 ++- frame/compat/cblas/src/cblas_f77.h | 13 +- frame/compat/cblas/src/cblas_zgemm.c | 2 +- frame/compat/cblas/src/extra/cblas_cgemm3m.c | 115 ++++++ .../cblas/src/{ => extra}/cblas_cgemmt.c | 0 .../cblas/src/{ => extra}/cblas_dgemmt.c | 0 .../cblas/src/{ => extra}/cblas_sgemmt.c | 0 frame/compat/cblas/src/extra/cblas_zgemm3m.c | 113 ++++++ .../cblas/src/{ => extra}/cblas_zgemmt.c | 0 frame/compat/check/bla_gemm3m_check.h | 89 +++++ frame/compat/extra/bla_gemm3m.c | 259 +++++++++++++ frame/compat/extra/bla_gemm3m.h | 59 +++ frame/compat/extra/bla_gemm_batch.c | 15 +- frame/compat/{ => extra}/bla_gemmt.c | 15 +- frame/compat/{ => extra}/bla_gemmt.h | 0 test/Makefile | 2 +- test/test_gemm3m.c | 352 ++++++++++++++++++ 18 files changed, 1062 insertions(+), 48 deletions(-) create mode 100644 frame/compat/cblas/src/extra/cblas_cgemm3m.c rename frame/compat/cblas/src/{ => extra}/cblas_cgemmt.c (100%) rename frame/compat/cblas/src/{ => extra}/cblas_dgemmt.c (100%) rename frame/compat/cblas/src/{ => extra}/cblas_sgemmt.c (100%) create mode 100644 frame/compat/cblas/src/extra/cblas_zgemm3m.c rename frame/compat/cblas/src/{ => extra}/cblas_zgemmt.c (100%) create mode 100644 frame/compat/check/bla_gemm3m_check.h create mode 100644 frame/compat/extra/bla_gemm3m.c create mode 100644 frame/compat/extra/bla_gemm3m.h rename frame/compat/{ => extra}/bla_gemmt.c (97%) rename frame/compat/{ => extra}/bla_gemmt.h (100%) create mode 100644 test/test_gemm3m.c diff --git
a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h index a65953c11..c88a2e3c3 100644 --- a/frame/compat/bli_blas.h +++ b/frame/compat/bli_blas.h @@ -113,7 +113,6 @@ #include "bla_amax.h" #include "bla_asum.h" #include "bla_axpy.h" -#include "bla_axpby.h" #include "bla_copy.h" #include "bla_dot.h" #include "bla_nrm2.h" @@ -187,7 +186,6 @@ #include "bla_syr2k.h" #include "bla_trmm.h" #include "bla_trsm.h" -#include "bla_gemmt.h" #include "bla_gemm_check.h" #include "bla_hemm_check.h" @@ -198,12 +196,28 @@ #include "bla_syr2k_check.h" #include "bla_trmm_check.h" #include "bla_trsm_check.h" + + +// -- BLAS extension prototypes -- + +// unique to BLIS + +#include "bla_axpby.h" + +// level-3 + +#include "bla_gemmt.h" #include "bla_gemmt_check.h" -// -- Batch prototypes -- +// batch #include "bla_gemm_batch.h" +// 3m + +#include "bla_gemm3m.h" +#include "bla_gemm3m_check.h" + // -- Fortran-compatible APIs to BLIS functions -- diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h index cee74233c..22399ac8d 100644 --- a/frame/compat/cblas/src/cblas.h +++ b/frame/compat/cblas/src/cblas.h @@ -449,11 +449,6 @@ void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, float alpha, const float *A, - f77_int lda, const float *B, f77_int ldb, - float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, @@ -484,11 +479,6 @@ void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_dgemmt(enum 
CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, double alpha, const double *A, - f77_int lda, const double *B, f77_int ldb, - double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, @@ -519,11 +509,6 @@ void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, const void *alpha, const void *A, - f77_int lda, const void *B, f77_int ldb, - const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, @@ -554,11 +539,6 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); -void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, - f77_int N, f77_int K, const void *alpha, const void *A, - f77_int lda, const void *B, f77_int ldb, - const void *beta, void *C, f77_int ldc); /* @@ -616,6 +596,29 @@ void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); +// -- APIs to level-3-like operations -- + +void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, + f77_int N, f77_int K, float alpha, const float *A, + f77_int lda, const float *B, f77_int ldb, + float beta, float *C, f77_int ldc); 
+void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, + f77_int N, f77_int K, double alpha, const double *A, + f77_int lda, const double *B, f77_int ldb, + double beta, double *C, f77_int ldc); +void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, + f77_int N, f77_int K, const void *alpha, const void *A, + f77_int lda, const void *B, f77_int ldb, + const void *beta, void *C, f77_int ldc); +void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, + f77_int N, f77_int K, const void *alpha, const void *A, + f77_int lda, const void *B, f77_int ldb, + const void *beta, void *C, f77_int ldc); + // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, @@ -652,6 +655,19 @@ void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); +// -- 3m APIs -- + +void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, + f77_int K, const void *alpha, const void *A, + f77_int lda, const void *B, f77_int ldb, + const void *beta, void *C, f77_int ldc); +void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, + f77_int K, const void *alpha, const void *A, + f77_int lda, const void *B, f77_int ldb, + const void *beta, void *C, f77_int ldc); + #ifdef __cplusplus } #endif diff --git a/frame/compat/cblas/src/cblas_f77.h b/frame/compat/cblas/src/cblas_f77.h index e534d2054..acb354aaf 100644 --- a/frame/compat/cblas/src/cblas_f77.h +++ b/frame/compat/cblas/src/cblas_f77.h @@ -200,20 +200,23 @@ /* * BLAS extensions */ -#define F77_sgemmt sgemmt_ 
-#define F77_dgemmt dgemmt_ -#define F77_cgemmt cgemmt_ -#define F77_zgemmt zgemmt_ - #define F77_saxpby saxpby_ #define F77_daxpby daxpby_ #define F77_caxpby caxpby_ #define F77_zaxpby zaxpby_ +#define F77_sgemmt sgemmt_ +#define F77_dgemmt dgemmt_ +#define F77_cgemmt cgemmt_ +#define F77_zgemmt zgemmt_ + #define F77_sgemm_batch sgemm_batch_ #define F77_dgemm_batch dgemm_batch_ #define F77_cgemm_batch cgemm_batch_ #define F77_zgemm_batch zgemm_batch_ +#define F77_cgemm3m cgemm3m_ +#define F77_zgemm3m zgemm3m_ + #endif /* CBLAS_F77_H */ diff --git a/frame/compat/cblas/src/cblas_zgemm.c b/frame/compat/cblas/src/cblas_zgemm.c index e50de2205..8e08c2031 100644 --- a/frame/compat/cblas/src/cblas_zgemm.c +++ b/frame/compat/cblas/src/cblas_zgemm.c @@ -104,7 +104,7 @@ void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, F77_zgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B, &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } - else cblas_xerbla(1, "cblas_zgemm", "Illegal Order setting, %d\n", Order); + else cblas_xerbla(1, "cblas_zgemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; diff --git a/frame/compat/cblas/src/extra/cblas_cgemm3m.c b/frame/compat/cblas/src/extra/cblas_cgemm3m.c new file mode 100644 index 000000000..514e52545 --- /dev/null +++ b/frame/compat/cblas/src/extra/cblas_cgemm3m.c @@ -0,0 +1,115 @@ +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS +/* + * + * cblas_cgemm3m.c + * + * This program is a C interface to cgemm3m. + * + * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
+ * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, + f77_int K, const void *alpha, const void *A, + f77_int lda, const void *B, f77_int ldb, + const void *beta, void *C, f77_int ldc) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + + if( Order == CblasColMajor ) + { + if(TransA == CblasTrans) TA='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if(TransB == CblasTrans) TB='T'; + else if ( TransB == CblasConjTrans ) TB='C'; + else if ( TransB == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(3, "cblas_cgemm3m", "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_cgemm3m(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, + &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); + } else if (Order == CblasRowMajor) + { + RowMajorStrg = 1; + if(TransA == CblasTrans) TB='T'; + else if ( TransA == CblasConjTrans ) TB='C'; + else if ( TransA == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(TransB == CblasTrans) TA='T'; + else if ( TransB == 
CblasConjTrans ) TA='C'; + else if ( TransB == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + + F77_cgemm3m(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B, + &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_cgemm3m", "Illegal Order setting, %d\n", Order); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} +#endif diff --git a/frame/compat/cblas/src/cblas_cgemmt.c b/frame/compat/cblas/src/extra/cblas_cgemmt.c similarity index 100% rename from frame/compat/cblas/src/cblas_cgemmt.c rename to frame/compat/cblas/src/extra/cblas_cgemmt.c diff --git a/frame/compat/cblas/src/cblas_dgemmt.c b/frame/compat/cblas/src/extra/cblas_dgemmt.c similarity index 100% rename from frame/compat/cblas/src/cblas_dgemmt.c rename to frame/compat/cblas/src/extra/cblas_dgemmt.c diff --git a/frame/compat/cblas/src/cblas_sgemmt.c b/frame/compat/cblas/src/extra/cblas_sgemmt.c similarity index 100% rename from frame/compat/cblas/src/cblas_sgemmt.c rename to frame/compat/cblas/src/extra/cblas_sgemmt.c diff --git a/frame/compat/cblas/src/extra/cblas_zgemm3m.c b/frame/compat/cblas/src/extra/cblas_zgemm3m.c new file mode 100644 index 000000000..8be4278b4 --- /dev/null +++ b/frame/compat/cblas/src/extra/cblas_zgemm3m.c @@ -0,0 +1,113 @@ +#include "blis.h" +#ifdef BLIS_ENABLE_CBLAS +/* + * + * cblas_zgemm3m.c + * + * This program is a C interface to zgemm3m. + * + * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
+ * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, + f77_int K, const void *alpha, const void *A, + f77_int lda, const void *B, f77_int ldb, + const void *beta, void *C, f77_int ldc) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + + if( Order == CblasColMajor ) + { + if(TransA == CblasTrans) TA='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if(TransB == CblasTrans) TB='T'; + else if ( TransB == CblasConjTrans ) TB='C'; + else if ( TransB == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(3, "cblas_zgemm3m", "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + F77_zgemm3m(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, + &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); + } else if (Order == CblasRowMajor) + { + RowMajorStrg = 1; + if(TransA == CblasTrans) TB='T'; + else if ( TransA == CblasConjTrans ) TB='C'; + else if ( TransA == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(TransB == CblasTrans) TA='T'; + else if ( TransB == 
CblasConjTrans ) TA='C'; + else if ( TransB == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_zgemm3m(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B, + &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_zgemm3m", "Illegal Order setting, %d\n", Order); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} +#endif diff --git a/frame/compat/cblas/src/cblas_zgemmt.c b/frame/compat/cblas/src/extra/cblas_zgemmt.c similarity index 100% rename from frame/compat/cblas/src/cblas_zgemmt.c rename to frame/compat/cblas/src/extra/cblas_zgemmt.c diff --git a/frame/compat/check/bla_gemm3m_check.h b/frame/compat/check/bla_gemm3m_check.h new file mode 100644 index 000000000..f565b5d29 --- /dev/null +++ b/frame/compat/check/bla_gemm3m_check.h @@ -0,0 +1,89 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef BLIS_ENABLE_BLAS + +#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ +{ \ + f77_int info = 0; \ + f77_int nota, notb; \ + f77_int conja, conjb; \ + f77_int ta, tb; \ + f77_int nrowa, nrowb; \ +\ + nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ + notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ + conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ + conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ + ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ + tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ +\ + if ( nota ) { nrowa = *m; } \ + else { nrowa = *k; } \ + if ( notb ) { nrowb = *k; } \ + else { nrowb = *n; } \ +\ + if ( !nota && !conja && !ta ) \ + info = 1; \ + else if ( !notb && !conjb && !tb ) \ + info = 2; \ + else if ( *m < 0 ) \ + info = 3; \ + else if ( *n < 0 ) \ + info = 4; \ + else if ( *k < 0 ) \ + info = 5; \ + else if ( *lda < bli_max( 1, nrowa ) ) \ + info = 8; \ + else if ( *ldb < bli_max( 1, nrowb ) ) \ + info = 10; \ + else if ( *ldc < bli_max( 1, *m ) ) \ + info = 13; \ +\ + if ( info != 0 ) \ + { \ + char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH 
]; \ +\ + sprintf( func_str, "%s%-5s", dt_str, op_str ); \ +\ + bli_string_mkupper( func_str ); \ +\ + PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ +\ + return; \ + } \ +} + +#endif diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c new file mode 100644 index 000000000..11d542e69 --- /dev/null +++ b/frame/compat/extra/bla_gemm3m.c @@ -0,0 +1,259 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// + +#ifdef BLIS_BLAS3_CALLS_TAPI + +#undef GENTFUNCCO +#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_char* transb, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + const ftype* b, const f77_int* ldb, \ + const ftype* beta, \ + ftype* c, const f77_int* ldc \ + ) \ +{ \ + trans_t blis_transa; \ + trans_t blis_transb; \ + dim_t m0, n0, k0; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + transa, \ + transb, \ + m, \ + n, \ + k, \ + lda, \ + ldb, \ + ldc \ + ); \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ + bli_convert_blas_dim1( *k, k0 ); \ +\ + /* Set the row and column strides of the matrix operands.
*/ \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ +\ + /* As a placeholder, invoke 1m since BLIS no longer contains an + official 3m implementation. Note that we do this by inlining an + abbreviated version of bli_gemm_ex() so that we can bypass + consideration of sup, which doesn't make sense in this context. */ \ + { \ + cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, PASTEMAC(ch,type) ); /* this branch declares no local dt */ \ +\ + rntm_t rntm_l; \ + rntm_t* rntm = &rntm_l; \ + bli_rntm_init_from_global( rntm ); \ +\ + /* Note that we MUST disable sup handling since it could redirect + execution for some problem sizes to a non-3m implementation. */ \ + bli_rntm_disable_l3_sup( rntm ); \ +\ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_transa, \ + blis_transb, \ + m0, \ + n0, \ + k0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + (ftype*)b, rs_b, cs_b, \ + (ftype*)beta, \ + (ftype*)c, rs_c, cs_c, \ + cntx, \ + rntm \ + ); \ + } \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#else + +#undef GENTFUNCCO +#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_char* transb, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + const ftype* b, const f77_int* ldb, \ + const ftype* beta, \ + ftype* c, const f77_int* ldc \ + ) \ +{ \ + trans_t blis_transa; \ + trans_t blis_transb; \ + dim_t m0, n0, k0; \ +\ + /* Initialize BLIS. */ \ + bli_init_auto(); \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + transa, \ + transb, \ + m, \ + n, \ + k, \ + lda, \ + ldb, \ + ldc \ + ); \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value.
*/ \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ + bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ +\ + /* Typecast BLAS integers to BLIS integers. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ + bli_convert_blas_dim1( *k, k0 ); \ +\ + /* Set the row and column strides of the matrix operands. */ \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ +\ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t ao = BLIS_OBJECT_INITIALIZER; \ + obj_t bo = BLIS_OBJECT_INITIALIZER; \ + obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ + obj_t co = BLIS_OBJECT_INITIALIZER; \ +\ + dim_t m0_a, n0_a; \ + dim_t m0_b, n0_b; \ +\ + bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ + bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \ +\ + bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ +\ + bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( blis_transa, &ao ); \ + bli_obj_set_conjtrans( blis_transb, &bo ); \ +\ + /* As a placeholder, invoke 1m since BLIS no longer contains an + official 3m implementation. Note that we do this by inlining an + abbreviated version of bli_gemm_ex() so that we can bypass + consideration of sup, which doesn't make sense in this context.
*/ \ + { \ + cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \ +\ + rntm_t rntm_l; \ + rntm_t* rntm = &rntm_l; \ + bli_rntm_init_from_global( &rntm_l ); \ +\ + /* This is probably not needed given that we performed BLAS-style + parameter checking above, but bli_gemm_check() is normally called + in the normal course of bli_gemm_ex(). */ \ + if ( bli_error_checking_is_enabled() ) \ + bli_gemm_check( &alphao, &ao, &bo, &betao, &co, cntx ); \ +\ + PASTEMAC(blisname,_front) \ + ( \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co, \ + cntx, \ + rntm, \ + NULL \ + ); \ + } \ +\ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ +} + +#endif + +#ifdef BLIS_ENABLE_BLAS +INSERT_GENTFUNCCO_BLAS( gemm3m, gemm ) +#endif + diff --git a/frame/compat/extra/bla_gemm3m.h b/frame/compat/extra/bla_gemm3m.h new file mode 100644 index 000000000..86b7277c8 --- /dev/null +++ b/frame/compat/extra/bla_gemm3m.h @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype BLAS-to-BLIS interfaces. +// +#undef GENTPROTCO +#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ +\ +BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_char* transb, \ + const f77_int* m, \ + const f77_int* n, \ + const f77_int* k, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + const ftype* b, const f77_int* ldb, \ + const ftype* beta, \ + ftype* c, const f77_int* ldc \ + ); + +#ifdef BLIS_ENABLE_BLAS +INSERT_GENTPROTCO_BLAS( gemm3m ) +#endif + diff --git a/frame/compat/extra/bla_gemm_batch.c b/frame/compat/extra/bla_gemm_batch.c index be84572a3..4b2597e19 100644 --- a/frame/compat/extra/bla_gemm_batch.c +++ b/frame/compat/extra/bla_gemm_batch.c @@ -63,9 +63,6 @@ void PASTEF77(ch,blasname) \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -102,12 +99,12 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( k_array[i], k0 ); \ \ /* Set the row and column strides of the matrix operands. 
*/ \ - rs_a = 1; \ - cs_a = lda_array[i]; \ - rs_b = 1; \ - cs_b = ldb_array[i]; \ - rs_c = 1; \ - cs_c = ldc_array[i]; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = lda_array[i]; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = ldb_array[i]; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = ldc_array[i]; \ \ for ( f77_int j = 0; j < group_size[i]; j++ ) \ { \ diff --git a/frame/compat/bla_gemmt.c b/frame/compat/extra/bla_gemmt.c similarity index 97% rename from frame/compat/bla_gemmt.c rename to frame/compat/extra/bla_gemmt.c index 6f2439e9f..101cc6d13 100644 --- a/frame/compat/bla_gemmt.c +++ b/frame/compat/extra/bla_gemmt.c @@ -63,9 +63,6 @@ void PASTEF77(ch,blasname) \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, k0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -95,12 +92,12 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_gemmt.h b/frame/compat/extra/bla_gemmt.h similarity index 100% rename from frame/compat/bla_gemmt.h rename to frame/compat/extra/bla_gemmt.h diff --git a/test/Makefile b/test/Makefile index ae998ccde..361cd2ff8 100644 --- a/test/Makefile +++ b/test/Makefile @@ -145,7 +145,7 @@ CFLAGS += -I$(TEST_SRC_PATH) # Define the operations we will test. TEST_OPS := dotv axpyv axpbyv\ gemv ger hemv her her2 trmv trsv \ - gemm gemm_batch hemm herk her2k trmm trsm + gemm gemm3m gemm_batch hemm herk her2k trmm trsm # Optionally test gemmt, which some libraries might not implement. 
ifeq ($(BUILD_GEMMT),yes) diff --git a/test/test_gemm3m.c b/test/test_gemm3m.c new file mode 100644 index 000000000..8e7042901 --- /dev/null +++ b/test/test_gemm3m.c @@ -0,0 +1,352 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifdef WIN32 +#include +#else +#include +#endif +#include "blis.h" +#include "cblas.h" + +#define CBLAS +//#define FILE_IN_OUT +//#define PRINT +#define MATRIX_INITIALISATION + +int main( int argc, char** argv ) +{ + obj_t a, b, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, n, k; + dim_t p; + dim_t p_begin, p_end, p_inc; + int m_input, n_input, k_input; + num_t dt; + int r, n_repeats; + trans_t transa; + trans_t transb; + f77_char f77_transa; + f77_char f77_transb; + + double dtime; + double dtime_save; + double gflops; +#ifdef FILE_IN_OUT + FILE* fin = NULL; + FILE* fout = NULL; +#endif + //bli_init(); + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + +#ifndef PRINT + p_begin = 200; + p_end = 2000; + p_inc = 100; + + m_input = -1; + n_input = -1; + k_input = -1; +#else + p_begin = 16; + p_end = 16; + p_inc = 1; + + m_input = 5; + k_input = 6; + n_input = 4; +#endif + + dt = BLIS_SCOMPLEX; + //dt = BLIS_DCOMPLEX; + + transa = BLIS_NO_TRANSPOSE; + transb = BLIS_NO_TRANSPOSE; + + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); + + // printf("BLIS Library version is : %s\n", bli_info_get_version_str()); + +#ifdef FILE_IN_OUT + if ( argc < 3 ) + { + printf( "Usage: ./test_gemm_XX.x input.csv output.csv\n" ); + exit(1); + } + fin = fopen( argv[1], "r" ); + if ( fin == NULL ) + { + printf( "Error opening the file %s\n", argv[1] ); + exit(1); + } + fout = fopen( argv[2], "w" ); + if ( fout == NULL ) + { + printf( "Error opening output file %s\n", argv[2] ); + exit(1); + } + + fprintf( fout, "m\t k\t n\t cs_a\t cs_b\t cs_c\t gflops\t GEMM_Algo\n" ); + printf( "~~~~~~~~~~_BLAS\t m\t k\t n\t cs_a\t cs_b\t cs_c \t gflops\t GEMM_Algo\n" ); + + inc_t cs_a; + inc_t cs_b; + inc_t cs_c; + + while ( fscanf(fin, "%lld %lld %lld %lld %lld %lld\n", &m, &k, &n, &cs_a, &cs_b, &cs_c) == 6 ) + { + if ( ( m > cs_a ) || + ( k > cs_b ) || + ( m > cs_c ) ) continue; // leading 
dimension should be greater than number of rows + + bli_obj_create( dt, 1, 1, 0, 0, &alpha); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + bli_obj_create( dt, m, k, 1, cs_a, &a ); + bli_obj_create( dt, k, n, 1, cs_b, &b ); + bli_obj_create( dt, m, n, 1, cs_c, &c ); + bli_obj_create( dt, m, n, 1, cs_c, &c_save ); +#ifdef MATRIX_INITIALISATION + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); +#endif + bli_obj_set_conjtrans( transa, &a); + bli_obj_set_conjtrans( transb, &b); + + //bli_setsc( 0.0, -1, &alpha ); + //bli_setsc( 0.0, 1, &beta ); + + bli_setsc( -1, 0.0, &alpha ); + bli_setsc( 1, 0.0, &beta ); + +#else + for ( p = p_begin; p <= p_end; p += p_inc ) + { + if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + if ( k_input < 0 ) k = p * ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + bli_obj_create( dt, m, k, 0, 0, &a ); + bli_obj_create( dt, k, n, 0, 0, &b ); + bli_obj_create( dt, m, n, 0, 0, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); +#ifdef MATRIX_INITIALISATION + + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); +#endif + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + bli_setsc( (0.9/1.0), 0.2, &alpha ); + bli_setsc( -(1.1/1.0), 0.3, &beta ); + +#endif + bli_copym( &c, &c_save ); + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + dtime = bli_clock(); + + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "b", &b, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#ifndef CBLAS + + if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + 
f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* bp = bli_obj_buffer( &b ); + scomplex* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); + + cgemm3m_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* bp = bli_obj_buffer( &b ); + dcomplex* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); + + zgemm3m_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } +#else + if ( bli_is_scomplex( dt ) ) + { + scomplex* ap = bli_obj_buffer( &a ); + scomplex* bp = bli_obj_buffer( &b ); + scomplex* cp = bli_obj_buffer( &c ); + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* betap = bli_obj_buffer( &beta ); + cblas_cgemm3m( CblasColMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + (const void*)alphap, + ap, m, + bp, k, + (const void*)betap, + cp, m ); + } + else if (bli_is_dcomplex(dt)) + { + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* bp = bli_obj_buffer( &b ); + dcomplex* cp = bli_obj_buffer( &c ); + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* betap = bli_obj_buffer( &beta ); + cblas_zgemm3m( CblasColMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + (const void*)alphap, + ap, m, + bp, k, + (const void*)betap, + cp, m ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.6f", "" ); + exit(1); +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 2.0 * m * k * n ) / ( dtime_save 
* 1.0e9 ); + + gflops *= 4.0; //to represent complex ops in gflops + +#ifdef BLIS + printf( "data_gemm_blis" ); +#else + printf( "data_gemm_%s", BLAS ); +#endif + +#ifdef FILE_IN_OUT + + printf("%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f\n", \ + ( unsigned long )m, + ( unsigned long )k, + ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops); + + + fprintf(fout, "%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \n", \ + ( unsigned long )m, + ( unsigned long )k, + ( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops); + fflush(fout); + +#else + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )k, + ( unsigned long )n, gflops ); +#endif + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); +#ifdef FILE_IN_OUT + fclose( fin ); + fclose( fout ); +#endif + return 0; +} From 1ec020b33ece1681c0041e2549eed2bd4c6cf356 Mon Sep 17 00:00:00 2001 From: Dipal M Zambare <71366780+dzambare@users.noreply.github.com> Date: Wed, 30 Mar 2022 02:45:36 +0530 Subject: [PATCH 042/230] AMD kernel updates; frame-specific AMD updates. (#597) Details: - Allow building BLIS with certain framework files (each with the '_amd' suffix) that have been customized by AMD for Zen-based hardware. These customized files were derived from portable versions of the same files (i.e., those without the '_amd' suffix). Whether the portable or AMD- specific files are compiled is now controlled by a new configure option, --[en|dis]able-amd-frame-tweaks. This option is disabled by default in vanilla BLIS, though AMD may choose to enable it by default in their fork. 
For now, the added AMD-specific files are: - bli_gemv_unf_var2_amd.c - bla_copy_amd.c - bla_gemv_amd.c These files reside in 'amd' subdirectories found within the directory housing their generic counterparts. - Register optimized real-domain copyv, setv, and swapv kernels in bli_cntx_init_zen.c. - Various minor updates to level-1v kernels in 'zen' kernel set. - Added caxpyf kernel as well as saxpyf and multiple daxpyf kernels to the 'zen' kernel set. - If the problem passed to ?gemm_() in bla_gemm.c has a unit m or n dim, call gemv instead and return early. - Combined variable declarations with their initialization in various level-2 and level-3 BLAS compatibility files, and also inserted 'const' qualifier in those same declaration statements. - Moved frame/compat/bla_gemmt.c and .h to frame/compat/extra/ . - Added copyv and swapv test drivers to 'test' directory. - Whitespace, comment changes. --- Makefile | 23 + build/config.mk.in | 4 + config/amd64/bli_family_amd64.h | 6 - config/zen/bli_cntx_init_zen.c | 11 +- config/zen2/bli_cntx_init_zen2.c | 5 +- config/zen3/bli_cntx_init_zen3.c | 9 +- config/zen3/bli_family_zen3.h | 5 +- configure | 50 +- frame/2/gemv/amd/bli_gemv_unf_var2_amd.c | 222 ++++ frame/compat/amd/bla_copy_amd.c | 147 +++ frame/compat/amd/bla_gemv_amd.c | 172 +++ frame/compat/bla_gemm.c | 59 +- frame/compat/bla_gemv.c | 27 +- frame/compat/bla_ger.c | 5 +- frame/compat/bla_hemm.c | 15 +- frame/compat/bla_hemv.c | 5 +- frame/compat/bla_her.c | 5 +- frame/compat/bla_her2.c | 5 +- frame/compat/bla_her2k.c | 15 +- frame/compat/bla_herk.c | 10 +- frame/compat/bla_symm.c | 15 +- frame/compat/bla_symv.c | 5 +- frame/compat/bla_syr.c | 5 +- frame/compat/bla_syr2.c | 5 +- frame/compat/bla_syr2k.c | 15 +- frame/compat/bla_syrk.c | 10 +- frame/compat/bla_trmm.c | 10 +- frame/compat/bla_trmv.c | 5 +- frame/compat/bla_trsm.c | 10 +- frame/compat/bla_trsv.c | 5 +- kernels/zen/1/bli_scalv_zen_int10.c | 62 +- kernels/zen/1f/bli_axpyf_zen_int_4.c | 277 +++++ 
kernels/zen/1f/bli_axpyf_zen_int_5.c | 1231 ++++++++++++++++++++++ kernels/zen/bli_kernels_zen.h | 14 +- kernels/zen2/1f/bli_axpyf_zen_int_5.c | 599 ----------- test/test_copyv.c | 218 ++++ test/test_swapv.c | 180 ++++ 37 files changed, 2716 insertions(+), 750 deletions(-) create mode 100644 frame/2/gemv/amd/bli_gemv_unf_var2_amd.c create mode 100644 frame/compat/amd/bla_copy_amd.c create mode 100644 frame/compat/amd/bla_gemv_amd.c create mode 100644 kernels/zen/1f/bli_axpyf_zen_int_4.c create mode 100644 kernels/zen/1f/bli_axpyf_zen_int_5.c delete mode 100644 kernels/zen2/1f/bli_axpyf_zen_int_5.c create mode 100644 test/test_copyv.c create mode 100644 test/test_swapv.c diff --git a/Makefile b/Makefile index 992983328..5605dd8fc 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2022, Advanced Micro Devices, Inc. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -219,6 +220,28 @@ MK_ADDON_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDO # enabled a configure-time, this variable will we empty. MK_SANDBOX_OBJS := $(call gen-obj-paths-from-src,$(SANDBOX_SRC_SUFS),$(MK_SANDBOX_SRC),$(SANDBOX_PATH),$(BASE_OBJ_SANDBOX_PATH)) +# AMD has chosen to introduce AOCL-specific optimizations to certain BLIS +# framework files that are otherwise intended to remain generic. Upstream +# developers of vanilla BLIS have agreed to integrate some of these +# optimizations, but in a way that keeps the AOCL-specific code segregated +# in separate files containing the suffix '_amd'. For example, the BLAS +# compatibility layer in vanilla BLIS contains a generic file named +# 'bla_gemm.c'. AMD's version of this file is named 'bla_gemm_amd.c'. +# Only one or the other is ever built and included in libblis. Currently, +# these files are chosen automatically based on the target configuration. 
+ifeq ($(ENABLE_AMD_FRAME_TWEAKS),yes) +# Build is being done for AMD platforms; remove the objects which DO NOT have +# an "_amd" suffix. +MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS)) +FILES_TO_REMOVE := $(subst _amd.o,.o, $(MK_FRAME_AMD_OBJS)) +MK_FRAME_OBJS := $(filter-out $(FILES_TO_REMOVE), $(MK_FRAME_OBJS)) +else +# Build is being done for non-AMD platforms; remove the objects which DO have +# an "_amd" suffix. +MK_FRAME_AMD_OBJS := $(filter $(BASE_OBJ_FRAME_PATH)/%amd.o, $(MK_FRAME_OBJS)) +MK_FRAME_OBJS := $(filter-out $(MK_FRAME_AMD_OBJS), $(MK_FRAME_OBJS)) +endif + # Combine all of the object files into some readily-accessible variables. MK_BLIS_OBJS := $(MK_CONFIG_OBJS) \ $(MK_KERNELS_OBJS) \ diff --git a/build/config.mk.in b/build/config.mk.in index 79ecea653..56d6211c2 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2022, Advanced Micro Devices, Inc. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -203,5 +204,8 @@ SANDBOX := @sandbox@ # variable is set to the empty value. LIBPTHREAD := @libpthread@ +# Whether we should use AMD-customized versions of certain framework files. +ENABLE_AMD_FRAME_TWEAKS := @enable_amd_frame_tweaks@ + # end of ifndef CONFIG_MK_INCLUDED conditional block endif diff --git a/config/amd64/bli_family_amd64.h b/config/amd64/bli_family_amd64.h index ac10789aa..4791cceeb 100644 --- a/config/amd64/bli_family_amd64.h +++ b/config/amd64/bli_family_amd64.h @@ -35,11 +35,5 @@ #ifndef BLIS_FAMILY_AMD64_H #define BLIS_FAMILY_AMD64_H -// Enable framework optimizations for EPYC family processors. -// With this macro defined, we can call kernels directly from -// BLAS interfaces for levels 1 & 2. -// This macro needs to be defined for all EPYC configurations. 
-#define BLIS_CONFIG_EPYC - #endif diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 615a31a04..1b16cd06f 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -66,6 +66,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx ); @@ -98,13 +99,14 @@ void bli_cntx_init_zen( cntx_t* cntx ) // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx ); // Update the context with optimized level-1v kernels. bli_cntx_set_l1v_kers ( - 10, + 16, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -119,7 +121,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif -#if 0 +#if 1 // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, @@ -142,7 +144,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif -#if 0 +#if 1 // setv BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, @@ -151,6 +153,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, #endif + cntx ); diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 0964ce463..ba728602b 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -64,6 +64,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx ); @@ -96,6 +97,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx ); @@ -135,6 +137,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) //set BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + cntx ); diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index b5bbb05ed..0336ddc95 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -49,6 +49,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) bli_cntx_set_l3_nat_ukrs ( 8, + // gemm BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, @@ -62,6 +63,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx ); @@ -96,12 +98,15 @@ void bli_cntx_init_zen3( cntx_t* cntx ) bli_cntx_set_l1f_kers ( 4, + // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx ); @@ -114,8 +119,6 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, - // axpyv - // axpyv BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h index 8487a7277..661313ca9 100644 --- a/config/zen3/bli_family_zen3.h +++ b/config/zen3/bli_family_zen3.h @@ -63,8 +63,9 @@ #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 -#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 -#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 + +#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 +#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 diff --git a/configure b/configure index 5f3e83eaa..f64aac705 100755 --- a/configure +++ b/configure @@ -5,7 +5,7 @@ # libraries. 
# # Copyright (C) 2014, The University of Texas at Austin -# Copyright (C) 2020, Advanced Micro Devices, Inc. +# Copyright (C) 2020-2022, Advanced Micro Devices, Inc. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -270,6 +270,21 @@ print_usage() echo " \"small\" depends on thresholds that may vary by sub-" echo " configuration." echo " " + echo " --enable-amd-frame-tweaks, --disable-amd-frame-tweaks" + echo " " + echo " Enable building with certain framework files that have" + echo " been customized by AMD for Zen-based microarchitectures." + echo " The default counterparts of these files must be portable," + echo " and so these customized files may provide some (typically" + echo " modest) performance improvement for some select operations" + echo " and/or APIs, though there may a few (tiny dimension) cases" + echo " where the improvement is more pronounced. Note that the" + echo " target configuration must be Zen-based (or 'amd64') for" + echo " this option to have any effect. (Also note that this" + echo " option is NOT to be confused with enabling AMD *kernels*," + echo " which are determined by the BLIS subconfiguration used at" + echo " runtime.) By default, these customized files are disabled." + echo " " echo " -a NAME --enable-addon=NAME" echo " " echo " Enable the code provided by an addon. An addon consists" @@ -2453,6 +2468,7 @@ main() enable_mixed_dt='yes' enable_mixed_dt_extra_mem='yes' enable_sup_handling='yes' + enable_amd_frame_tweaks='no' enable_memkind='' # The default memkind value is determined later on. 
enable_trsm_preinversion='yes' force_version='no' @@ -2665,6 +2681,12 @@ main() disable-sup-handling) enable_sup_handling='no' ;; + enable-amd-frame-tweaks) + enable_amd_frame_tweaks='yes' + ;; + disable-amd-frame-tweaks) + enable_amd_frame_tweaks='no' + ;; with-memkind) enable_memkind='yes' ;; @@ -3567,6 +3589,29 @@ main() exit 1 fi + # Check whether we should use AMD-customized versions of certain framework + # files. + if [ "x${enable_amd_frame_tweaks}" = "xyes" ]; then + + echo "${script_name}: AMD-specific framework files will be considered." + echo "${script_name}: checking eligibility of target configuration." + + # Make sure we are targeting either one of the zen subconfigs or the + # amd64 umbrella family. + uconf=$(echo ${config_name} | grep -c 'zen\|amd64') + + if [[ $uconf == 0 ]]; then + echo "${script_name}: target configuration '${config_name}' is not eligible." + echo "${script_name}: disabling AMD-specific framework files." + enable_amd_frame_tweaks='no' + else + echo "${script_name}: target configuration '${config_name}' is eligible." + echo "${script_name}: enabling AMD-specific framework files." + fi + else + echo "${script_name}: AMD-specific framework files will not be considered." + fi + # Check if addons were given. if [ -n "${addon_flag}" ]; then @@ -3709,7 +3754,7 @@ main() # Create a #define for the configuration family (config_name). uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" - + # Create a list of #defines, one for each configuration in config_list. 
config_list_defines="" for conf in ${config_list}; do @@ -3820,6 +3865,7 @@ main() | sed -e "s/@export_shared@/${export_shared}/g" \ | sed -e "s/@enable_blas@/${enable_blas}/g" \ | sed -e "s/@enable_cblas@/${enable_cblas}/g" \ + | sed -e "s/@enable_amd_frame_tweaks@/${enable_amd_frame_tweaks}/g" \ | sed -e "s/@enable_memkind@/${enable_memkind}/g" \ | sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \ | sed -e "s/@addon_list@/${addon_list}/g" \ diff --git a/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c new file mode 100644 index 000000000..8f0f31479 --- /dev/null +++ b/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c @@ -0,0 +1,222 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, scalvsuf, axpyfsuf, fusefac ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + ctype* beta, \ + ctype* y, inc_t incy, \ + cntx_t* cntx \ + ) \ +{ \ + /*const num_t dt = PASTEMAC(ch,type);*/ \ +\ + ctype* A1; \ + ctype* x1; \ + ctype* y1; \ + dim_t i; \ + dim_t b_fuse, f; \ + dim_t n_elem, n_iter; \ + inc_t rs_at, cs_at; \ + conj_t conja; \ +\ + bli_set_dims_incs_with_trans( transa, \ + m, n, rs_a, cs_a, \ + &n_elem, &n_iter, &rs_at, &cs_at ); \ +\ + conja = bli_extract_conj( transa ); \ +\ + /* y = beta * y; */ \ + /* NOTE: We don't explicitly handle the case where beta == 0 here + since that behavior is handled within the scalv kernel itself. */ \ + PASTEMAC2(ch,scalv,scalvsuf) \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + beta, \ + y, incy, \ + cntx \ + ); \ +\ + /* If alpha == 0, then we are done. */ \ + if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ +\ + /*PASTECH(ch,axpyf_ker_ft) kfp_af;*/ \ +\ + /* Query the context for the kernel function pointer and fusing factor. 
*/ \ + /*kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );*/ \ + /*b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );*/ \ + b_fuse = fusefac; \ +\ + for ( i = 0; i < n_iter; i += f ) \ + { \ + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ +\ + A1 = a + (0 )*rs_at + (i )*cs_at; \ + x1 = x + (i )*incx; \ + y1 = y + (0 )*incy; \ +\ + /* y = y + alpha * A1 * x1; */ \ + /*kfp_af*/ \ + PASTEMAC2(ch,axpyf,axpyfsuf) \ + ( \ + conja, \ + conjx, \ + n_elem, \ + f, \ + alpha, \ + A1, rs_at, cs_at, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) +GENTFUNC( float, s, gemv_unf_var2, _zen_int10, _zen_int_5, 5 ) +GENTFUNC( double, d, gemv_unf_var2, _zen_int10, _zen_int_16x4, 4 ) +GENTFUNC( scomplex, c, gemv_unf_var2, _zen_int10, _zen_int_4, 4 ) +//GENTFUNC( dcomplex, z, gemv_unf_var2, _zen_int10, _ex, 1 ) + + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx, \ + ctype* beta, \ + ctype* y, inc_t incy, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + ctype* zero = PASTEMAC(ch,0); \ + ctype* A1; \ + ctype* x1; \ + ctype* y1; \ + dim_t i; \ + dim_t b_fuse, f; \ + dim_t n_elem, n_iter; \ + inc_t rs_at, cs_at; \ + conj_t conja; \ +\ + bli_set_dims_incs_with_trans( transa, \ + m, n, rs_a, cs_a, \ + &n_elem, &n_iter, &rs_at, &cs_at ); \ +\ + conja = bli_extract_conj( transa ); \ +\ + /* If beta is zero, use setv. Otherwise, scale by beta. 
*/ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + /* y = 0; */ \ + PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + zero, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + /* y = beta * y; */ \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + beta, \ + y, incy, \ + cntx, \ + NULL \ + ); \ + } \ +\ + PASTECH(ch,axpyf_ker_ft) kfp_af; \ +\ + /* Query the context for the kernel function pointer and fusing factor. */ \ + kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ +\ + for ( i = 0; i < n_iter; i += f ) \ + { \ + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ +\ + A1 = a + (0 )*rs_at + (i )*cs_at; \ + x1 = x + (i )*incx; \ + y1 = y + (0 )*incy; \ +\ + /* y = y + alpha * A1 * x1; */ \ + kfp_af \ + ( \ + conja, \ + conjx, \ + n_elem, \ + f, \ + alpha, \ + A1, rs_at, cs_at, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ + } \ +} + +//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) +GENTFUNC( dcomplex, z, gemv_unf_var2 ) + diff --git a/frame/compat/amd/bla_copy_amd.c b/frame/compat/amd/bla_copy_amd.c new file mode 100644 index 000000000..6780b555e --- /dev/null +++ b/frame/compat/amd/bla_copy_amd.c @@ -0,0 +1,147 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_int* n, \ + const ftype* x, const f77_int* incx, \ + ftype* y, const f77_int* incy \ + ) \ +{ \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ +\ + /* Initialize BLIS. */ \ + /*bli_init_auto()*/; \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* Call BLIS interface. 
*/ \ + /* NOTE: While we skip explicit initialization for real domain instances + since we call the microkernel directly, the complex domain instances + still need initialization so that they can query valid contexts from + gks. However, the expert API will self-initialize before attempting + to query a context, so the complex domain cases should work fine. */ \ + PASTEMAC2(ch,blisname,isuf) \ + ( \ + BLIS_NO_CONJUGATE, \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL \ + ); \ +\ + /* Finalize BLIS. */ \ + /*bli_finalize_auto();*/ \ +} + +#ifdef BLIS_ENABLE_BLAS +//INSERT_GENTFUNC_BLAS( copy, copyv ) +GENTFUNC( float, s, copy, copyv, _zen_int ) +GENTFUNC( double, d, copy, copyv, _zen_int ) +#endif + + +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_int* n, \ + const ftype* x, const f77_int* incx, \ + ftype* y, const f77_int* incy \ + ) \ +{ \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ +\ + /* Initialize BLIS. */ \ + /*bli_init_auto()*/; \ +\ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* Call BLIS interface. */ \ + /* NOTE: While we skip explicit initialization for real domain instances + since we call the microkernel directly, the complex domain instances + still need initialization so that they can query valid contexts from + gks. However, the expert API will self-initialize before attempting + to query a context, so the complex domain cases should work fine. */ \ + PASTEMAC2(ch,blisname,isuf) \ + ( \ + BLIS_NO_CONJUGATE, \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ +\ + /* Finalize BLIS. 
*/ \ + /*bli_finalize_auto();*/ \ +} + +#ifdef BLIS_ENABLE_BLAS +//INSERT_GENTFUNC_BLAS( copy, copyv ) +GENTFUNC( scomplex, c, copy, copyv, _ex ) +GENTFUNC( dcomplex, z, copy, copyv, _ex ) +#endif + diff --git a/frame/compat/amd/bla_gemv_amd.c b/frame/compat/amd/bla_gemv_amd.c new file mode 100644 index 000000000..398d1bf2c --- /dev/null +++ b/frame/compat/amd/bla_gemv_amd.c @@ -0,0 +1,172 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +// +// Define BLAS-to-BLIS interfaces. +// +#undef GENTFUNC +#define GENTFUNC( ftype, ch, blasname, blisname ) \ +\ +void PASTEF77(ch,blasname) \ + ( \ + const f77_char* transa, \ + const f77_int* m, \ + const f77_int* n, \ + const ftype* alpha, \ + const ftype* a, const f77_int* lda, \ + const ftype* x, const f77_int* incx, \ + const ftype* beta, \ + ftype* y, const f77_int* incy \ + ) \ +{ \ + trans_t blis_transa; \ + dim_t m0, n0; \ + dim_t m_y, n_x; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ +\ + /* Initialize BLIS. */ \ + /*bli_init_auto();*/ \ +\ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + transa, \ + m, \ + n, \ + lda, \ + incx, \ + incy \ + ); \ +\ + /* BLAS handles cases where y has no elements as well as those where x has + no elements. In the case of the former, it cannot do any work since + the output vector is empty; but in the latter case, BLAS has peculiar + semantics. When x has no elements (and transa(A) has no columns), BLAS + returns immediately without performing any computation even if the + number of elements of y (and rows of transa(A)) is non-zero, in which + case any sane interpretation of gemv would have the operation + reduce to y := beta * y. Here, we emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. 
Note that this extreme level of + compatibility would not be contemplated if it weren't for the fact + that some BLAS unit tests actually check for this behavior. Also, it + should be emphasized that BLIS, when called natively, does NOT exhibit + this quirky behavior; it will scale y by beta as one would expect. */ \ + if ( *m == 0 || *n == 0 ) \ + { \ + /* Finalize BLIS. */ \ + /*bli_finalize_auto();*/ \ +\ + return; \ + } \ +\ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ +\ + /* Convert/typecast negative values of m and n to zero. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ +\ + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ \ + bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \ +\ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \ +\ + /* If alpha is zero, scale y by beta and return early. */ \ + if ( PASTEMAC(ch,eq0)( *alpha ) ) \ + { \ + PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + m_y, \ + ( ftype* )beta, \ + ( ftype* )y0, incy0, \ + NULL, \ + NULL \ + ); \ + return; \ + } \ +\ + /* Set the row and column strides of A. */ \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ +\ + /* Declare a void function pointer for the current operation. */ \ + PASTECH2(ch,blisname,_unb_ft) f; \ +\ + /* Choose the underlying implementation. */ \ + if ( bli_does_notrans( blis_transa ) ) f = PASTEMAC(ch,gemv_unf_var2); \ + else /* if ( bli_does_trans( blis_transa ) ) */ f = PASTEMAC(ch,gemv_unf_var1); \ +\ + /* Obtain a valid context from the gks. 
This is needed because these + implementations of ?gemv_() skip calling gemv_ex() and instead + call the unblocked fused variants directly. */ \ + cntx_t* cntx = bli_gks_query_cntx(); \ +\ + /* Invoke the variant chosen above, which loops over a level-1v or + level-1f kernel to implement the current operation. */ \ + f \ + ( \ + blis_transa, \ + BLIS_NO_CONJUGATE, \ + m0, \ + n0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + x0, incx0, \ + (ftype*)beta, \ + y0, incy0, \ + cntx \ + ); \ +\ + /* Finalize BLIS. */ \ + /*bli_finalize_auto();*/ \ +} + +#ifdef BLIS_ENABLE_BLAS +INSERT_GENTFUNC_BLAS( gemv, gemv ) +#endif + diff --git a/frame/compat/bla_gemm.c b/frame/compat/bla_gemm.c index e04e48cf5..e71d4e2fc 100644 --- a/frame/compat/bla_gemm.c +++ b/frame/compat/bla_gemm.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -62,9 +62,6 @@ void PASTEF77(ch,blasname) \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -94,12 +91,12 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ @@ -179,6 +176,48 @@ void PASTEF77(ch,blasname) \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ +\ + /* Handle special cases of m == 1 or n == 1 via gemv. */ \ + if ( n0 == 1 ) \ + { \ + dim_t m0t, k0t; \ + bli_set_dims_with_trans( blis_transa, m0, k0, &m0t, &k0t ); \ +\ + PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \ + ( \ + blis_transa, \ + bli_extract_conj( blis_transb ), \ + m0t, k0t, \ + ( ftype* )alpha, \ + ( ftype* )a, rs_a, cs_a, \ + ( ftype* )b, ( bli_does_notrans( blis_transb ) ? rs_b : cs_b ), \ + ( ftype* )beta, \ + c, rs_c, \ + NULL, \ + NULL \ + ); \ + return; \ + } \ + else if ( m0 == 1 ) \ + { \ + dim_t n0t, k0t; \ + bli_set_dims_with_trans( blis_transb, n0, k0, &n0t, &k0t ); \ +\ + PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \ + ( \ + blis_transb, \ + bli_extract_conj( blis_transa ), \ + n0t, k0t, \ + ( ftype* )alpha, \ + ( ftype* )b, cs_b, rs_b, \ + ( ftype* )a, ( bli_does_notrans( blis_transa ) ? cs_a : rs_a ), \ + ( ftype* )beta, \ + c, cs_c, \ + NULL, \ + NULL \ + ); \ + return; \ + } \ \ const num_t dt = PASTEMAC(ch,type); \ \ diff --git a/frame/compat/bla_gemv.c b/frame/compat/bla_gemv.c index 85c65dde4..8d730edd9 100644 --- a/frame/compat/bla_gemv.c +++ b/frame/compat/bla_gemv.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -60,7 +61,6 @@ void PASTEF77(ch,blasname) \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -89,16 +89,19 @@ void PASTEF77(ch,blasname) \ if necessary.*/ \ bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \ \ - /* BLAS handles cases where trans(A) has no columns, and x has no elements, - in a peculiar way. 
In these situations, BLAS returns without performing - any action, even though most sane interpretations of gemv would have the - the operation reduce to y := beta * y. Here, we catch those cases that - BLAS would normally mishandle and emulate the BLAS exactly so as to + /* BLAS handles cases where y has no elements as well as those where x has + no elements. In the case of the former, it cannot do any work since + the output vector is empty; but in the latter case, BLAS has peculiar + semantics. When x has no elements (and transa(A) has no columns), BLAS + returns immediately without performing any computation even if the + number of elements of y (and rows of transa(A)) is non-zero, in which + case any sane interpretation of gemv would have the operation + reduce to y := beta * y. Here, we emulate the BLAS exactly so as to provide "bug-for-bug" compatibility. Note that this extreme level of - compatibility would not be as much of an issue if it weren't for the - fact that some BLAS test suites actually test for these cases. Also, it - should be emphasized that BLIS, if called natively, does NOT exhibit - this quirky behavior; it will scale y by beta, as one would expect. */ \ + compatibility would not be contemplated if it weren't for the fact + that some BLAS unit tests actually check for this behavior. Also, it + should be emphasized that BLIS, when called natively, does NOT exhibit + this quirky behavior; it will scale y by beta as one would expect. */ \ if ( m_y > 0 && n_x == 0 ) \ { \ /* Finalize BLIS. */ \ @@ -113,8 +116,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_ger.c b/frame/compat/bla_ger.c index db4f76f18..b558bfd94 100644 --- a/frame/compat/bla_ger.c +++ b/frame/compat/bla_ger.c @@ -56,7 +56,6 @@ void PASTEF772(ch,blasname,chc) \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -84,8 +83,8 @@ void PASTEF772(ch,blasname,chc) \ bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_hemm.c b/frame/compat/bla_hemm.c index 6bfb13e18..9a4484a09 100644 --- a/frame/compat/bla_hemm.c +++ b/frame/compat/bla_hemm.c @@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \ side_t blis_side; \ uplo_t blis_uploa; \ dim_t m0, n0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -91,12 +88,12 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_hemv.c b/frame/compat/bla_hemv.c index 944468278..d036c10e3 100644 --- a/frame/compat/bla_hemv.c +++ b/frame/compat/bla_hemv.c @@ -58,7 +58,6 @@ void PASTEF77(ch,blasname) \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -87,8 +86,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. 
*/ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_her.c b/frame/compat/bla_her.c index ade3cbdda..512081d89 100644 --- a/frame/compat/bla_her.c +++ b/frame/compat/bla_her.c @@ -54,7 +54,6 @@ void PASTEF77(ch,blasname) \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -81,8 +80,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_her2.c b/frame/compat/bla_her2.c index e3ed4ce31..7d99a6378 100644 --- a/frame/compat/bla_her2.c +++ b/frame/compat/bla_her2.c @@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -86,8 +85,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_her2k.c b/frame/compat/bla_her2k.c index df5121975..2a058dc02 100644 --- a/frame/compat/bla_her2k.c +++ b/frame/compat/bla_her2k.c @@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -108,12 +105,12 @@ void PASTEF77(ch,blasname) \ } \ \ /* Set the row and column strides of the matrix operands. 
*/ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_herk.c b/frame/compat/bla_herk.c index d9c47f5af..8236e2032 100644 --- a/frame/compat/bla_herk.c +++ b/frame/compat/bla_herk.c @@ -60,8 +60,6 @@ void PASTEF77(ch,blasname) \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ - inc_t rs_a, cs_a; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -105,10 +103,10 @@ void PASTEF77(ch,blasname) \ } \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_symm.c b/frame/compat/bla_symm.c index b4f0b66d0..098beb472 100644 --- a/frame/compat/bla_symm.c +++ b/frame/compat/bla_symm.c @@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \ side_t blis_side; \ uplo_t blis_uploa; \ dim_t m0, n0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -91,12 +88,12 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c index 79076194c..c5b5ebda3 100644 --- a/frame/compat/bla_symv.c +++ b/frame/compat/bla_symv.c @@ -58,7 +58,6 @@ void PASTEF77(ch,blasname) \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -87,8 +86,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c index 0ed4aebb1..6732a75cf 100644 --- a/frame/compat/bla_syr.c +++ b/frame/compat/bla_syr.c @@ -54,7 +54,6 @@ void PASTEF77(ch,blasname) \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -81,8 +80,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_syr2.c b/frame/compat/bla_syr2.c index dbae67027..7050c0488 100644 --- a/frame/compat/bla_syr2.c +++ b/frame/compat/bla_syr2.c @@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ - inc_t rs_a, cs_a; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -87,8 +86,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_syr2k.c b/frame/compat/bla_syr2k.c index 35cfca9a3..2b26171b6 100644 --- a/frame/compat/bla_syr2k.c +++ b/frame/compat/bla_syr2k.c @@ -61,9 +61,6 @@ void PASTEF77(ch,blasname) \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -100,12 +97,12 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_syrk.c b/frame/compat/bla_syrk.c index 82ce2f166..4f3f15367 100644 --- a/frame/compat/bla_syrk.c +++ b/frame/compat/bla_syrk.c @@ -60,8 +60,6 @@ void PASTEF77(ch,blasname) \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ - inc_t rs_a, cs_a; \ - inc_t rs_c, cs_c; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -97,10 +95,10 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_c = 1; \ - cs_c = *ldc; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_c = 1; \ + const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_trmm.c b/frame/compat/bla_trmm.c index ce099dc59..b77a60dd6 100644 --- a/frame/compat/bla_trmm.c +++ b/frame/compat/bla_trmm.c @@ -63,8 +63,6 @@ void PASTEF77(ch,blasname) \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0, n0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ \ /* Initialize BLIS. 
*/ \ bli_init_auto(); \ @@ -95,10 +93,10 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_trmv.c b/frame/compat/bla_trmv.c index ffb31b12f..2821d4bfa 100644 --- a/frame/compat/bla_trmv.c +++ b/frame/compat/bla_trmv.c @@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ - inc_t rs_a, cs_a; \ ftype* one_p; \ \ /* Initialize BLIS. */ \ @@ -89,8 +88,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Acquire a pointer to the global scalar constant BLIS_ONE. */ \ one_p = PASTEMAC(ch,1); \ diff --git a/frame/compat/bla_trsm.c b/frame/compat/bla_trsm.c index c0d8e4b3e..9af008090 100644 --- a/frame/compat/bla_trsm.c +++ b/frame/compat/bla_trsm.c @@ -63,8 +63,6 @@ void PASTEF77(ch,blasname) \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0, n0; \ - inc_t rs_a, cs_a; \ - inc_t rs_b, cs_b; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ @@ -95,10 +93,10 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ - rs_a = 1; \ - cs_a = *lda; \ - rs_b = 1; \ - cs_b = *ldb; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ + const inc_t rs_b = 1; \ + const inc_t cs_b = *ldb; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ diff --git a/frame/compat/bla_trsv.c b/frame/compat/bla_trsv.c index 445059720..91132934e 100644 --- a/frame/compat/bla_trsv.c +++ b/frame/compat/bla_trsv.c @@ -57,7 +57,6 @@ void PASTEF77(ch,blasname) \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ - inc_t rs_a, cs_a; \ ftype* one_p; \ \ /* Initialize BLIS. */ \ @@ -89,8 +88,8 @@ void PASTEF77(ch,blasname) \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + const inc_t rs_a = 1; \ + const inc_t cs_a = *lda; \ \ /* Acquire a pointer to the global scalar constant BLIS_ONE. */ \ one_p = PASTEMAC(ch,1); \ diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index c4096cbbc..c8488890f 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. 
Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -81,16 +81,9 @@ void bli_sscalv_zen_int10 if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; -#ifdef BLIS_CONFIG_ZEN2 - bli_ssetv_zen_int - ( - BLIS_NO_CONJUGATE, - n, - zero, - x, incx, - cntx - ); -#else + + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( @@ -100,7 +93,7 @@ void bli_sscalv_zen_int10 x, incx, cntx ); -#endif + return; } @@ -281,16 +274,9 @@ void bli_dscalv_zen_int10 if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; -#ifdef BLIS_CONFIG_ZEN2 - bli_dsetv_zen_int - ( - BLIS_NO_CONJUGATE, - n, - zero, - x, incx, - cntx - ); -#else + + if( cntx == NULL ) cntx = bli_gks_query_cntx(); + dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f @@ -301,7 +287,7 @@ void bli_dscalv_zen_int10 x, incx, cntx ); -#endif + return; } @@ -454,3 +440,39 @@ void bli_dscalv_zen_int10 } } +// ----------------------------------------------------------------------------- + +// +// NOTE: This function definition is provided as a placeholder in order to allow +// function names of scalv kernels to be hard-coded in bli_gemv_unf_var2_amd.c. +// It forwards all of its arguments to the scomplex BLIS_SCALV_KER kernel that +// is registered in the context. If cntx is NULL, a context is first queried +// from the gks, as the s and d kernels in this file do before using cntx. +// + +void bli_cscalv_zen_int10 + ( + conj_t conjalpha, + dim_t n, + scomplex* restrict alpha, + scomplex* restrict x, inc_t incx, + cntx_t* restrict cntx + ) +{ + const num_t dt = BLIS_SCOMPLEX; + + // A NULL context cannot be queried for a kernel; obtain a valid one first. + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); + + f + ( + conjalpha, + n, + alpha, + x, incx, + cntx + ); +} + diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c new file mode 100644 index 000000000..5ddb56ac5 --- /dev/null +++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c @@ -0,0 +1,277 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2021-2022, Advanced Micro Devices, Inc. 
All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "immintrin.h" +#include "blis.h" + + + void bli_caxpyf_zen_int_4 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + scomplex* restrict alpha, + scomplex* restrict a, inc_t inca, inc_t lda, + scomplex* restrict x, inc_t incx, + scomplex* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + inc_t fuse_fac = 4; + inc_t i; + + __m256 ymm0, ymm1, ymm2, ymm3; + __m256 ymm4, ymm5, ymm6, ymm7; + __m256 ymm8, ymm10; + __m256 ymm12, ymm13; + + float* ap[4]; + float* y0 = (float*)y; + + scomplex chi0; + scomplex chi1; + scomplex chi2; + scomplex chi3; + + + dim_t setPlusOne = 1; + + if ( bli_is_conj(conja) ) + { + setPlusOne = -1; + } + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_ceq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. + if ( b_n != fuse_fac ) + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + scomplex* a1 = a + (0 )*inca + (i )*lda; + scomplex* chi1 = x + (i )*incx; + scomplex* y1 = y + (0 )*incy; + scomplex alpha_chi1; + + bli_ccopycjs( conjx, *chi1, alpha_chi1 ); + bli_cscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + + // At this point, we know that b_n is exactly equal to the fusing factor. + if(bli_is_noconj(conjx)) + { + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + } + else + { + scomplex *pchi0 = x + 0*incx ; + scomplex *pchi1 = x + 1*incx ; + scomplex *pchi2 = x + 2*incx ; + scomplex *pchi3 = x + 3*incx ; + + bli_ccopycjs( conjx, *pchi0, chi0 ); + bli_ccopycjs( conjx, *pchi1, chi1 ); + bli_ccopycjs( conjx, *pchi2, chi2 ); + bli_ccopycjs( conjx, *pchi3, chi3 ); + } + + // Scale each chi scalar by alpha. 
+ bli_cscals( *alpha, chi0 ); + bli_cscals( *alpha, chi1 ); + bli_cscals( *alpha, chi2 ); + bli_cscals( *alpha, chi3 ); + + lda *= 2; + incx *= 2; + incy *= 2; + inca *= 2; + + ap[0] = (float*)a; + ap[1] = (float*)a + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if( inca == 2 && incy == 2 ) + { + inc_t n1 = m/4; + inc_t n2 = m%4; + + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + + // broadcast real & imag parts of 4 elements of x + ymm0 = _mm256_broadcast_ss(&chi0.real); // real part of x0 + ymm1 = _mm256_broadcast_ss(&chi0.imag); // imag part of x0 + ymm2 = _mm256_broadcast_ss(&chi1.real); // real part of x1 + ymm3 = _mm256_broadcast_ss(&chi1.imag); // imag part of x1 + ymm4 = _mm256_broadcast_ss(&chi2.real); // real part of x2 + ymm5 = _mm256_broadcast_ss(&chi2.imag); // imag part of x2 + ymm6 = _mm256_broadcast_ss(&chi3.real); // real part of x3 + ymm7 = _mm256_broadcast_ss(&chi3.imag); // imag part of x3 + + for(i = 0; i < n1; i++) + { + //load first two columns of A + ymm8 = _mm256_loadu_ps(ap[0] + 0); + ymm10 = _mm256_loadu_ps(ap[1] + 0); + + ymm12 = _mm256_mul_ps(ymm8, ymm0); + ymm13 = _mm256_mul_ps(ymm8, ymm1); + + ymm12 = _mm256_fmadd_ps(ymm10, ymm2, ymm12); + ymm13 = _mm256_fmadd_ps(ymm10, ymm3, ymm13); + + //load 3rd and 4th columns of A + ymm8 = _mm256_loadu_ps(ap[2] + 0); + ymm10 = _mm256_loadu_ps(ap[3] + 0); + + ymm12 = _mm256_fmadd_ps(ymm8, ymm4, ymm12); + ymm13 = _mm256_fmadd_ps(ymm8, ymm5, ymm13); + + ymm12 = _mm256_fmadd_ps(ymm10, ymm6, ymm12); + ymm13 = _mm256_fmadd_ps(ymm10, ymm7, ymm13); + + //load Y vector + ymm10 = _mm256_loadu_ps(y0 + 0); + + if(bli_is_noconj(conja)) + { + //printf("Inside no conj if\n"); + ymm13 = _mm256_permute_ps(ymm13, 0xB1); + ymm8 = _mm256_addsub_ps(ymm12, ymm13); + } + else + { + ymm12 = _mm256_permute_ps(ymm12, 0xB1); + ymm8 = _mm256_addsub_ps(ymm13, ymm12); + ymm8 = _mm256_permute_ps(ymm8, 0xB1); + } + + ymm12 = _mm256_add_ps(ymm8, ymm10); + + _mm256_storeu_ps((float*)(y0), ymm12); + + y0 
+= 8; + ap[0] += 8; + ap[1] += 8; + ap[2] += 8; + ap[3] += 8; + } + + // If there are leftover iterations, perform them with scalar code. + + for ( i = 0; (i + 0) < n2 ; ++i ) + { + + scomplex y0c = *(scomplex*)y0; + + const scomplex a0c = *(scomplex*)ap[0]; + const scomplex a1c = *(scomplex*)ap[1]; + const scomplex a2c = *(scomplex*)ap[2]; + const scomplex a3c = *(scomplex*)ap[3]; + + y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne; + y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne; + y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne; + y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne; + + y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne; + y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne; + y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne; + y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne; + + *(scomplex*)y0 = y0c; + + ap[0] += 2; + ap[1] += 2; + ap[2] += 2; + ap[3] += 2; + y0 += 2; + } + //PASTEMAC(c,fprintm)(stdout, "Y after A*x in axpyf",m, 1, (scomplex*)y, 1, 1, "%4.1f", ""); + + } + else + { + for (i = 0 ; (i + 0) < m ; ++i ) + { + scomplex y0c = *(scomplex*)y0; + const scomplex a0c = *(scomplex*)ap[0]; + const scomplex a1c = *(scomplex*)ap[1]; + const scomplex a2c = *(scomplex*)ap[2]; + const scomplex a3c = *(scomplex*)ap[3]; + + y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne; + y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne; + y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne; + y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne; + + y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne; + y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne; + y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne; + y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne; + + 
*(scomplex*)y0 = y0c; + + ap[0] += inca; + ap[1] += inca; + ap[2] += inca; + ap[3] += inca; + y0 += incy; + } + } +} diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c new file mode 100644 index 000000000..15a64d596 --- /dev/null +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -0,0 +1,1231 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "immintrin.h" +#include "blis.h" + +/* Union data structure to access AVX registers + One 256-bit AVX register holds 8 SP elements. */ +typedef union +{ + __m256 v; + float f[8] __attribute__((aligned(64))); +} v8sf_t; + +/* Union data structure to access AVX registers +* One 256-bit AVX register holds 4 DP elements. */ +typedef union +{ + __m256d v; + __m128d xmm[2]; + double d[4] __attribute__((aligned(64))); +} v4df_t; + +typedef union +{ + __m128d v; + double d[2] __attribute__((aligned(64))); +} v2df_t; + + +void bli_saxpyf_zen_int_5 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + float* restrict alpha, + float* restrict a, inc_t inca, inc_t lda, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 5; + + const dim_t n_elem_per_reg = 8; + const dim_t n_iter_unroll = 2; + + dim_t i; + + float* restrict a0; + float* restrict a1; + float* restrict a2; + float* restrict a3; + float* restrict a4; + + float* restrict y0; + + v8sf_t chi0v, chi1v, chi2v, chi3v; + v8sf_t chi4v; + + v8sf_t a00v, a01v, a02v, a03v; + v8sf_t a04v; + + v8sf_t a10v, a11v, a12v, a13v; + v8sf_t a14v; + + v8sf_t y0v, y1v; + + float chi0, chi1, chi2, chi3; + float chi4; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. 
+ if ( b_n != fuse_fac ) + { + if(cntx == NULL) cntx = bli_gks_query_cntx(); + saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + float* a1 = a + (0 )*inca + (i )*lda; + float* chi1 = x + (i )*incx; + float* y1 = y + (0 )*incy; + float alpha_chi1; + + bli_scopycjs( conjx, *chi1, alpha_chi1 ); + bli_sscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + a4 = a + 4*lda; + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + chi4 = *( x + 4*incx ); + + + // Scale each chi scalar by alpha. + bli_sscals( *alpha, chi0 ); + bli_sscals( *alpha, chi1 ); + bli_sscals( *alpha, chi2 ); + bli_sscals( *alpha, chi3 ); + bli_sscals( *alpha, chi4 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_ss( &chi0 ); + chi1v.v = _mm256_broadcast_ss( &chi1 ); + chi2v.v = _mm256_broadcast_ss( &chi2 ); + chi3v.v = _mm256_broadcast_ss( &chi3 ); + chi4v.v = _mm256_broadcast_ss( &chi4 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 15) < m; i += 16 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg ); + + a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); + a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v ); + + + // Store the output. + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + a4 += n_iter_unroll * n_elem_per_reg; + } + + for( ; (i + 7) < m; i += 8 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); + a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); + a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); + a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); + a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); + + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); + + // Store the output. + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + a4 += n_elem_per_reg; + } + + // If there are leftover iterations, perform them with scalar code. + for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const float a0c = *a0; + const float a1c = *a1; + const float a2c = *a2; + const float a3c = *a3; + const float a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + a4 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const float a0c = *a0; + const float a1c = *a1; + const float a2c = *a2; + const float a3c = *a3; + const float a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + a4 += inca; + y0 += incy; + } + + } +} + + +// ----------------------------------------------------------------------------- + +void bli_daxpyf_zen_int_5 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + 
double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 5; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 2; + + dim_t i; + + double* restrict a0; + double* restrict a1; + double* restrict a2; + double* restrict a3; + double* restrict a4; + + double* restrict y0; + + v4df_t chi0v, chi1v, chi2v, chi3v; + v4df_t chi4v; + + v4df_t a00v, a01v, a02v, a03v; + v4df_t a04v; + + v4df_t a10v, a11v, a12v, a13v; + v4df_t a14v; + + v4df_t y0v, y1v; + + double chi0, chi1, chi2, chi3; + double chi4; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. + if ( b_n != fuse_fac ) + { + daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + a4 = a + 4*lda; + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + chi4 = *( x + 4*incx ); + + + // Scale each chi scalar by alpha. + bli_dscals( *alpha, chi0 ); + bli_dscals( *alpha, chi1 ); + bli_dscals( *alpha, chi2 ); + bli_dscals( *alpha, chi3 ); + bli_dscals( *alpha, chi4 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. 
+ chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + chi2v.v = _mm256_broadcast_sd( &chi2 ); + chi3v.v = _mm256_broadcast_sd( &chi3 ); + chi4v.v = _mm256_broadcast_sd( &chi4 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v ); + + + // Store the output. 
+ _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + a4 += n_iter_unroll * n_elem_per_reg; + } + + for( ; (i + 3) < m; i += 4 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + a4 += n_elem_per_reg; + } + + // If there are leftover iterations, perform them with scalar code. 
+ for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + a4 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + a4 += inca; + y0 += incy; + } + + } +} + +// ----------------------------------------------------------------------------- + +static void bli_daxpyf_zen_int_16x2 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 2; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 4; + + dim_t i; + + double* restrict a0; + double* restrict a1; + + double* restrict y0; + + v4df_t chi0v, chi1v; + + v4df_t a00v, a01v; + + v4df_t a10v, a11v; + + v4df_t a20v, a21v; + + v4df_t a30v, a31v; + + v4df_t y0v, y1v, y2v, y3v; + + double chi0, chi1; + + v2df_t a40v, a41v; + + v2df_t y4v; + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. 
+ if ( b_n != fuse_fac ) + { + daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + + + // Scale each chi scalar by alpha. + bli_dscals( *alpha, chi0 ); + bli_dscals( *alpha, chi1 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 15) < m; i += 16 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + } + + for ( ; (i + 11) < m; i += 12 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + + y0 += 3 * n_elem_per_reg; + a0 += 3 * n_elem_per_reg; + a1 += 3 * n_elem_per_reg; + } + for ( ; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + // Store the output. 
+ _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += 2 * n_elem_per_reg; + a0 += 2 * n_elem_per_reg; + a1 += 2 * n_elem_per_reg; + } + + for ( ; (i + 3) < m; i += 4 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + } + + for ( ; (i + 1) < m; i += 2 ) + { + // Load the input values. + y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg ); + + a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg ); + + a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v ); + + // Store the output. + _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v ); + + y0 += 2; + a0 += 2; + a1 += 2; + } + + // If there are leftover iterations, perform them with scalar code. 
+ for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + y0 += incy; + } + + } +} + +// ----------------------------------------------------------------------------- +void bli_daxpyf_zen_int_16x4 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 4; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 4; + + dim_t i; + + double* restrict a0; + double* restrict a1; + double* restrict a2; + double* restrict a3; + + double* restrict y0; + + v4df_t chi0v, chi1v, chi2v, chi3v; + + v4df_t a00v, a01v, a02v, a03v; + + v4df_t a10v, a11v, a12v, a13v; + + v4df_t a20v, a21v, a22v, a23v; + + v4df_t a30v, a31v, a32v, a33v; + + v4df_t y0v, y1v, y2v, y3v; + + double chi0, chi1, chi2, chi3; + + v2df_t y4v; + + v2df_t a40v, a41v, a42v, a43v; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. 
+ if ( b_n != fuse_fac ) + { + if(cntx == NULL) cntx = bli_gks_query_cntx(); + daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + + // Scale each chi scalar by alpha. + bli_dscals( *alpha, chi0 ); + bli_dscals( *alpha, chi1 ); + bli_dscals( *alpha, chi2 ); + bli_dscals( *alpha, chi3 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + chi2v.v = _mm256_broadcast_sd( &chi2 ); + chi3v.v = _mm256_broadcast_sd( &chi3 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 15) < m; i += 16 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg ); + a32v.v = _mm256_loadu_pd( a2 + 3*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg ); + a33v.v = _mm256_loadu_pd( a3 + 3*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a32v.v, chi2v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a33v.v, chi3v.v, y3v.v ); + + // Store the output. 
+ _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + } + + for ( ; (i + 11) < m; i += 12 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, 
y2v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + + y0 += 3 * n_elem_per_reg; + a0 += 3 * n_elem_per_reg; + a1 += 3 * n_elem_per_reg; + a2 += 3 * n_elem_per_reg; + a3 += 3 * n_elem_per_reg; + } + + for ( ; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += 2 * n_elem_per_reg; + a0 += 2 * n_elem_per_reg; + a1 += 2 * n_elem_per_reg; + a2 += 2 * n_elem_per_reg; + a3 += 2 * n_elem_per_reg; + } + + + for ( ; (i + 3) < m; i += 4) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + } +#if 1 + for ( ; (i + 1) < m; i += 2) + { + + // Load the input values. + y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg ); + + a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg ); + + a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg ); + + a42v.v = _mm_loadu_pd( a2 + 0*n_elem_per_reg ); + + a43v.v = _mm_loadu_pd( a3 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a42v.v, chi2v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a43v.v, chi3v.xmm[0], y4v.v ); + + // Store the output. + _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v ); + + y0 += 2; + a0 += 2; + a1 += 2; + a2 += 2; + a3 += 2; + } +#endif + // If there are leftover iterations, perform them with scalar code. 
+ for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + + y0 += incy; + } + + } +} + + diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 161bcef1a..c9651554d 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -73,6 +73,7 @@ SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) +SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) @@ -86,11 +87,21 @@ COPYV_KER_PROT( double, d, copyv_zen_int ) SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) +// swapv (intrinsics) +SWAPV_KER_PROT(float, s, swapv_zen_int8 ) +SWAPV_KER_PROT(double, d, swapv_zen_int8 ) + + // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) +AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) +AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) + +AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) +AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) @@ -199,3 +210,4 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) + diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/bli_axpyf_zen_int_5.c deleted file mode 100644 index 5a919b622..000000000 --- a/kernels/zen2/1f/bli_axpyf_zen_int_5.c +++ /dev/null @@ -1,599 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2020, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "immintrin.h" -#include "blis.h" - -/* Union data structure to access AVX registers - One 256-bit AVX register holds 8 SP elements. */ -typedef union -{ - __m256 v; - float f[8] __attribute__((aligned(64))); -} v8sf_t; - -/* Union data structure to access AVX registers -* One 256-bit AVX register holds 4 DP elements. 
*/ -typedef union -{ - __m256d v; - double d[4] __attribute__((aligned(64))); -} v4df_t; - - -void bli_saxpyf_zen_int_5 - ( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy, - cntx_t* restrict cntx - ) -{ - const dim_t fuse_fac = 5; - - const dim_t n_elem_per_reg = 8; - const dim_t n_iter_unroll = 2; - - dim_t i; - - float* restrict a0; - float* restrict a1; - float* restrict a2; - float* restrict a3; - float* restrict a4; - - float* restrict y0; - - v8sf_t chi0v, chi1v, chi2v, chi3v; - v8sf_t chi4v; - - v8sf_t a00v, a01v, a02v, a03v; - v8sf_t a04v; - - v8sf_t a10v, a11v, a12v, a13v; - v8sf_t a14v; - - v8sf_t y0v, y1v; - - float chi0, chi1, chi2, chi3; - float chi4; - - // If either dimension is zero, or if alpha is zero, return early. - if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return; - - // If b_n is not equal to the fusing factor, then perform the entire - // operation as a loop over axpyv. - if ( b_n != fuse_fac ) - { -#ifdef BLIS_CONFIG_ZEN2 - for ( i = 0; i < b_n; ++i ) - { - float* a1 = a + (0 )*inca + (i )*lda; - float* chi1 = x + (i )*incx; - float* y1 = y + (0 )*incy; - float alpha_chi1; - - bli_scopycjs( conjx, *chi1, alpha_chi1 ); - bli_sscals( *alpha, alpha_chi1 ); - - bli_saxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else - saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); - - for ( i = 0; i < b_n; ++i ) - { - float* a1 = a + (0 )*inca + (i )*lda; - float* chi1 = x + (i )*incx; - float* y1 = y + (0 )*incy; - float alpha_chi1; - - bli_scopycjs( conjx, *chi1, alpha_chi1 ); - bli_sscals( *alpha, alpha_chi1 ); - - f - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#endif - return; - } - - // At this point, we know that b_n is exactly equal to the fusing factor. 
- - a0 = a + 0*lda; - a1 = a + 1*lda; - a2 = a + 2*lda; - a3 = a + 3*lda; - a4 = a + 4*lda; - y0 = y; - - chi0 = *( x + 0*incx ); - chi1 = *( x + 1*incx ); - chi2 = *( x + 2*incx ); - chi3 = *( x + 3*incx ); - chi4 = *( x + 4*incx ); - - - // Scale each chi scalar by alpha. - bli_sscals( *alpha, chi0 ); - bli_sscals( *alpha, chi1 ); - bli_sscals( *alpha, chi2 ); - bli_sscals( *alpha, chi3 ); - bli_sscals( *alpha, chi4 ); - - // Broadcast the (alpha*chi?) scalars to all elements of vector registers. - chi0v.v = _mm256_broadcast_ss( &chi0 ); - chi1v.v = _mm256_broadcast_ss( &chi1 ); - chi2v.v = _mm256_broadcast_ss( &chi2 ); - chi3v.v = _mm256_broadcast_ss( &chi3 ); - chi4v.v = _mm256_broadcast_ss( &chi4 ); - - // If there are vectorized iterations, perform them with vector - // instructions. - if ( inca == 1 && incy == 1 ) - { - for ( i = 0; (i + 15) < m; i += 16 ) - { - // Load the input values. - y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); - y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); - - a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); - a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg ); - - a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); - a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg ); - - a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); - a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg ); - - a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); - a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg ); - - a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); - a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg ); - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a13v.v, 
chi3v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v ); - - - // Store the output. - _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); - _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v ); - - y0 += n_iter_unroll * n_elem_per_reg; - a0 += n_iter_unroll * n_elem_per_reg; - a1 += n_iter_unroll * n_elem_per_reg; - a2 += n_iter_unroll * n_elem_per_reg; - a3 += n_iter_unroll * n_elem_per_reg; - a4 += n_iter_unroll * n_elem_per_reg; - } - - for( ; (i + 7) < m; i += 8 ) - { - // Load the input values. - y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); - - a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); - a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); - a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); - a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); - a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); - - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); - - // Store the output. - _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); - - y0 += n_elem_per_reg; - a0 += n_elem_per_reg; - a1 += n_elem_per_reg; - a2 += n_elem_per_reg; - a3 += n_elem_per_reg; - a4 += n_elem_per_reg; - } - - // If there are leftover iterations, perform them with scalar code. 
- for ( ; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const float a0c = *a0; - const float a1c = *a1; - const float a2c = *a2; - const float a3c = *a3; - const float a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += 1; - a1 += 1; - a2 += 1; - a3 += 1; - a4 += 1; - y0 += 1; - } - } - else - { - for ( i = 0; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const float a0c = *a0; - const float a1c = *a1; - const float a2c = *a2; - const float a3c = *a3; - const float a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += inca; - a1 += inca; - a2 += inca; - a3 += inca; - a4 += inca; - y0 += incy; - } - - } -} - - -// ----------------------------------------------------------------------------- - -void bli_daxpyf_zen_int_5 - ( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy, - cntx_t* restrict cntx - ) -{ - const dim_t fuse_fac = 5; - - const dim_t n_elem_per_reg = 4; - const dim_t n_iter_unroll = 2; - - dim_t i; - - double* restrict a0; - double* restrict a1; - double* restrict a2; - double* restrict a3; - double* restrict a4; - - double* restrict y0; - - v4df_t chi0v, chi1v, chi2v, chi3v; - v4df_t chi4v; - - v4df_t a00v, a01v, a02v, a03v; - v4df_t a04v; - - v4df_t a10v, a11v, a12v, a13v; - v4df_t a14v; - - v4df_t y0v, y1v; - - double chi0, chi1, chi2, chi3; - double chi4; - - // If either dimension is zero, or if alpha is zero, return early. - if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; - - // If b_n is not equal to the fusing factor, then perform the entire - // operation as a loop over axpyv. 
- if ( b_n != fuse_fac ) - { -#ifdef BLIS_CONFIG_ZEN2 - for ( i = 0; i < b_n; ++i ) - { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; - double alpha_chi1; - - bli_dcopycjs( conjx, *chi1, alpha_chi1 ); - bli_dscals( *alpha, alpha_chi1 ); - - bli_daxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); - - for ( i = 0; i < b_n; ++i ) - { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; - double alpha_chi1; - - bli_dcopycjs( conjx, *chi1, alpha_chi1 ); - bli_dscals( *alpha, alpha_chi1 ); - - f - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#endif - return; - } - - // At this point, we know that b_n is exactly equal to the fusing factor. - - a0 = a + 0*lda; - a1 = a + 1*lda; - a2 = a + 2*lda; - a3 = a + 3*lda; - a4 = a + 4*lda; - y0 = y; - - chi0 = *( x + 0*incx ); - chi1 = *( x + 1*incx ); - chi2 = *( x + 2*incx ); - chi3 = *( x + 3*incx ); - chi4 = *( x + 4*incx ); - - - // Scale each chi scalar by alpha. - bli_dscals( *alpha, chi0 ); - bli_dscals( *alpha, chi1 ); - bli_dscals( *alpha, chi2 ); - bli_dscals( *alpha, chi3 ); - bli_dscals( *alpha, chi4 ); - - // Broadcast the (alpha*chi?) scalars to all elements of vector registers. - chi0v.v = _mm256_broadcast_sd( &chi0 ); - chi1v.v = _mm256_broadcast_sd( &chi1 ); - chi2v.v = _mm256_broadcast_sd( &chi2 ); - chi3v.v = _mm256_broadcast_sd( &chi3 ); - chi4v.v = _mm256_broadcast_sd( &chi4 ); - - // If there are vectorized iterations, perform them with vector - // instructions. - if ( inca == 1 && incy == 1 ) - { - for ( i = 0; (i + 7) < m; i += 8 ) - { - // Load the input values. 
- y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - - a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); - a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); - - a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); - a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); - - a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); - a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); - - a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); - a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); - - a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); - a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg ); - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v ); - - - // Store the output. - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); - _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v ); - - y0 += n_iter_unroll * n_elem_per_reg; - a0 += n_iter_unroll * n_elem_per_reg; - a1 += n_iter_unroll * n_elem_per_reg; - a2 += n_iter_unroll * n_elem_per_reg; - a3 += n_iter_unroll * n_elem_per_reg; - a4 += n_iter_unroll * n_elem_per_reg; - } - - for( ; (i + 3) < m; i += 4 ) - { - // Load the input values. 
- y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - - a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); - a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); - a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); - a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); - a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); - - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); - - // Store the output. - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); - - y0 += n_elem_per_reg; - a0 += n_elem_per_reg; - a1 += n_elem_per_reg; - a2 += n_elem_per_reg; - a3 += n_elem_per_reg; - a4 += n_elem_per_reg; - } - - // If there are leftover iterations, perform them with scalar code. - for ( ; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const double a0c = *a0; - const double a1c = *a1; - const double a2c = *a2; - const double a3c = *a3; - const double a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += 1; - a1 += 1; - a2 += 1; - a3 += 1; - a4 += 1; - y0 += 1; - } - } - else - { - for ( i = 0; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const double a0c = *a0; - const double a1c = *a1; - const double a2c = *a2; - const double a3c = *a3; - const double a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += inca; - a1 += inca; - a2 += inca; - a3 += inca; - a4 += inca; - y0 += incy; - } - - } -} - diff --git a/test/test_copyv.c b/test/test_copyv.c new file mode 100644 index 000000000..a85004f12 --- /dev/null +++ b/test/test_copyv.c @@ -0,0 +1,218 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include <unistd.h> +#include "blis.h" + +//#define BLIS_ACCURACY_TEST +#ifdef BLIS_ACCURACY_TEST +// Exact element-wise comparison of two strided float vectors: prints the first mismatch and returns FALSE, otherwise returns TRUE. +bool scompare_result( int n, float *x, int incx, float *y, int incy ) +{ + for ( int i = 0; i < n; i++ ) + { + if ( (*x) != (*y) ) + { + printf( "%4f != %4f at location %d\n", *x, *y, i ); + return FALSE; + } + x += incx; + y += incy; + } + return TRUE; +} +// Double-precision counterpart of scompare_result(). +bool dcompare_result( int n, double *x, int incx, double *y, int incy ) +{ + for ( int i = 0; i < n; i++ ) + { + if ( (*x) != (*y) ) + { + printf( "%4f != %4f at location %d\n", *x, *y, i ); + return FALSE; + } + x += incx; + y += incy; + } + return TRUE; +} + +#endif + +// copyv driver: for each vector length in the sweep, runs the copy n_repeats times (bli_copyv when BLIS is defined, otherwise scopy_/dcopy_) and prints a GB/s figure derived from the time kept by bli_clock_min_diff(). +int main( int argc, char** argv ) +{ + obj_t x, y; + dim_t n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int n_input, sizeof_dt = 0; // stays 0 for datatypes that are not sized below + int r, n_repeats; + num_t dt; + + double dtime; + double dtime_save; + double gbps; + + //bli_init(); + + n_repeats = 100000; + +#ifndef PRINT + p_begin = 200; + p_end = 100000; + p_inc = 200; + + n_input = -1; +#else + p_begin = 16; + p_end = 16; + p_inc = 1; + + n_input = 16; +#endif + +#if 1 + // dt = BLIS_FLOAT; + dt = BLIS_DOUBLE; +#else + //dt = BLIS_SCOMPLEX; + dt = BLIS_DCOMPLEX; +#endif + + if ( dt == BLIS_FLOAT ) sizeof_dt = sizeof( float ); + else if ( dt == BLIS_DOUBLE ) sizeof_dt = sizeof( double ); + + printf( "executable\t n\t GBs per sec\n" ); + + for ( p = p_begin; p <= p_end; p += p_inc ) + { + // A negative n_input scales the sweep value p by its magnitude; otherwise n_input is used as the fixed length. + if ( n_input < 0 ) n = p * ( dim_t )abs( n_input ); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, n, 1, 0, 0, &x ); + bli_obj_create( dt, n, 1, 0, 0, &y ); + + bli_randm( &x ); + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + + dtime = bli_clock(); + +#ifdef BLIS + bli_copyv( &x, + &y ); +#else + if ( bli_is_float( dt ) ) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + float* xp = bli_obj_buffer( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + float* yp = bli_obj_buffer( &y ); + + scopy_( &nn, + xp, &incx, + yp, &incy ); + + } + else if
( bli_is_double( dt ) ) + { + + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + double* xp = bli_obj_buffer( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + double* yp = bli_obj_buffer( &y ); + + dcopy_( &nn, + xp, &incx, + yp, &incy ); + } +#endif + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + +#ifdef BLIS_ACCURACY_TEST + if ( dt == BLIS_FLOAT ) + { + int nn = bli_obj_length( &x ); + int incx = bli_obj_vector_inc( &x ); + float* xp = bli_obj_buffer( &x ); + int incy = bli_obj_vector_inc( &y ); + float* yp = bli_obj_buffer( &y ); + if ( scompare_result( nn, xp, incx, yp, incy ) ) + printf( "Copy Successful\n" ); + else + printf( "ALERT!!! Copy Failed\n" ); + } + if ( dt == BLIS_DOUBLE ) + { + int nn = bli_obj_length( &x ); + int incx = bli_obj_vector_inc( &x ); + double* xp = bli_obj_buffer( &x ); + int incy = bli_obj_vector_inc( &y ); + double* yp = bli_obj_buffer( &y ); + if ( dcompare_result( nn, xp, incx, yp, incy ) ) + printf( "Copy Successful\n" ); + else + printf( "ALERT!!! Copy Failed\n" ); + } +#endif + } + + // The sweep step grows to 1000 once p reaches 1000 and to 10000 once p reaches 10000, to cover a wide range of sizes. + if ( p >= 1000 ) p_inc = 1000; + if ( p >= 10000 ) p_inc = 10000; + gbps = ( n * sizeof_dt ) / ( dtime_save * 1.0e9 ); + +#ifdef BLIS + printf( "data_copyv_blis\t" ); +#else + printf( "data_copyv_%s\t", BLAS ); +#endif + printf( "%4lu\t %7.2f\n", + ( unsigned long )n, gbps ); + + bli_obj_free( &x ); + bli_obj_free( &y ); + } + + //bli_finalize(); + + return 0; +} diff --git a/test/test_swapv.c b/test/test_swapv.c new file mode 100644 index 000000000..4d8d35eac --- /dev/null +++ b/test/test_swapv.c @@ -0,0 +1,180 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include <unistd.h> +#include "blis.h" + +// n x incx y incy +//void dswap_( int*, double*, int*, double*, int* ); +//#define PRINT +// swapv driver: walks the problem sizes from p_end down to p_begin, times n_repeats swaps per size (bli_swapv when BLIS is defined, otherwise sswap_/dswap_) and prints one matlab-style result row per size. +int main( int argc, char** argv ) +{ + obj_t x, y; + dim_t n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int n_input; + int r, n_repeats; + num_t dt; + + double dtime; + double dtime_save; + double gflops; + + bli_init(); + + n_repeats = 3; + +#ifndef PRINT + p_begin = 40; + p_end = 8000; + p_inc = 40; + + n_input = -1; +#else + p_begin = 16; + p_end = 16; + p_inc = 1; + + n_input = -1; +#endif + +#if 1 + dt = BLIS_FLOAT; + //dt = BLIS_DOUBLE; +#else + //dt = BLIS_SCOMPLEX; + dt = BLIS_DCOMPLEX; +#endif + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; +#ifdef BLIS + printf( "data_swapv_blis" ); +#else + printf( "data_swapv_%s", BLAS ); +#endif + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, 0.0 ); + + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) + { + // A negative n_input scales the sweep value p by its magnitude; otherwise n_input is used as the fixed length. + if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + + bli_obj_create( dt, n, 1, 0, 0, &x ); + bli_obj_create( dt, n, 1, 0, 0, &y ); + + bli_randm( &x ); + bli_randm( &y ); + + dtime_save = 1.0e9; + + for ( r = 0; r < n_repeats; ++r ) + { + + dtime = bli_clock(); + +#ifdef PRINT + bli_printm( "x", &x, "%4.1f", "" ); + bli_printm( "y", &y, "%4.1f", "" ); +#endif + +#ifdef BLIS + + bli_swapv( &x, + &y ); +#else + if ( bli_is_float( dt ) ) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + float* xp = bli_obj_buffer( &x ); + float* yp = bli_obj_buffer( &y ); + + sswap_( &nn, + xp, &incx, + yp, &incy ); + + } + else if ( bli_is_double( dt ) ) + { + + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy =
bli_obj_vector_inc( &y ); + double* xp = bli_obj_buffer( &x ); + double* yp = bli_obj_buffer( &y ); + + dswap_( &nn, + xp, &incx, + yp, &incy ); + } +#endif + +#ifdef PRINT + bli_printm( "X after", &x, "%4.1f", "" ); + bli_printm( "Y after", &y, "%4.1f", "" ); + + exit(1); +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + // Despite the variable name, this is n divided by ( time * 1e9 ), i.e. billions of elements per unit of dtime_save. + gflops = ( n ) / ( dtime_save * 1.0e9 ); + +#ifdef BLIS + printf( "data_swapv_blis" ); +#else + printf( "data_swapv_%s", BLAS ); +#endif + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )n, gflops ); + + bli_obj_free( &x ); + bli_obj_free( &y ); + } + + bli_finalize(); + + return 0; +} From cf06364327bd2d21d606392371ff3c5962bee5ba Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 29 Mar 2022 16:18:25 -0500 Subject: [PATCH 043/230] Fixed typo in BLAS gemm3m call to _check(). Details: - Fixed an unresolved symbol issue leftover from #590 whereby ?gemm3m_() as defined in bla_gemm3m.c was referencing bla_gemm3m_check(), which does not exist. It should have simply called the _check() function for gemm. --- frame/compat/extra/bla_gemm3m.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c index 11d542e69..4533375f0 100644 --- a/frame/compat/extra/bla_gemm3m.c +++ b/frame/compat/extra/bla_gemm3m.c @@ -67,7 +67,7 @@ void PASTEF77(ch,blasname) \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ - PASTEBLACHK(blasname) \ + PASTEBLACHK(blisname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ @@ -162,7 +162,7 @@ void PASTEF77(ch,blasname) \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ - PASTEBLACHK(blasname) \ + PASTEBLACHK(blisname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ From bee7678b2558a691ac850819dbe33fefe4fdbee3 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 31 Mar 2022 14:09:39 -0500 Subject: [PATCH 044/230] CREDITS file update.
--- CREDITS | 1 + 1 file changed, 1 insertion(+) diff --git a/CREDITS b/CREDITS index 85ed97c6a..b701598cf 100644 --- a/CREDITS +++ b/CREDITS @@ -105,6 +105,7 @@ but many others have contributed code and feedback, including Meghana Vankadari @Meghana-vankadari (AMD) Kiran Varaganti @kvaragan (AMD) Natalia Vassilieva (Hewlett Packard Enterprise) + @h-vetinari Andrew Wildman @awild82 (University of Washington) Zhang Xianyi @xianyi (Chinese Academy of Sciences) Benda Xu @heroxbd From 99bb9002f1aff598d347eae2821a3f7bdd1f48e8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 1 Apr 2022 08:10:59 -0500 Subject: [PATCH 045/230] ReleaseNotes.md update in advance of next version. --- docs/ReleaseNotes.md | 110 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md index ce6f29a1c..ccb4d9f0e 100644 --- a/docs/ReleaseNotes.md +++ b/docs/ReleaseNotes.md @@ -4,6 +4,7 @@ ## Contents +* [Changes in 0.9.0](ReleaseNotes.md#changes-in-090) * [Changes in 0.8.1](ReleaseNotes.md#changes-in-081) * [Changes in 0.8.0](ReleaseNotes.md#changes-in-080) * [Changes in 0.7.0](ReleaseNotes.md#changes-in-070) @@ -39,6 +40,115 @@ * [Changes in 0.0.2](ReleaseNotes.md#changes-in-002) * [Changes in 0.0.1](ReleaseNotes.md#changes-in-001) +## Changes in 0.9.0 +April 1, 2022 + +Improvements present in 0.9.0: + +Framework: +- Added various fields to `obj_t` that relate to storing function pointers to custom `packm` kernels, microkernels, etc as well as accessor functions to set and query those fields. (Devin Matthews) +- Enabled user-customized `packm` microkernels and variants via the aforementioned new `obj_t` fields. (Devin Matthews) +- Moved edge-case handling out of the macrokernel and into the `gemm` and `gemmtrsm` microkernels. This also required updating of APIs and definitions of all existing microkernels in `kernels` directory. 
Edge-case handling functionality is now facilitated via new preprocessor macros found in `bli_edge_case_macro_defs.h`. (Devin Matthews) +- Avoid `gemmsup` thread barriers when not packing A or B. This boosts performance for many small multithreaded problems. (Field Van Zee, AMD) +- Allow the 1m method to operate normally when single and double real-domain microkernels mix row and column I/O preference. (Field Van Zee, Devin Matthews, RuQing Xu) +- Removed support for execution of complex-domain level-3 operations via the 3m and 4m methods. +- Refactored `herk`, `her2k`, `syrk`, `syr2k` in terms of `gemmt`. (Devin Matthews) +- Defined `setijv` and `getijv` to set/get vector elements. +- Defined `eqsc`, `eqv`, and `eqm` operations to test equality between two scalars, vectors, or matrices. +- Added new bounds checking to `setijm` and `getijm` to prevent use of negative indices. +- Renamed `membrk` files/variables/functions to `pba`. +- Store error-checking level as a thread-local variable. (Devin Matthews) +- Add `err_t*` "return" parameter to `bli_malloc_*()` and friends. +- Switched internal mutexes of the `sba` and `pba` to static initialization. +- Changed return value method of `bli_pack_get_pack_a()`, `bli_pack_get_pack_b()`. +- Fixed a bug that allows `bli_init()` to be called more than once (without segfaulting). (@lschork2, Minh Quan Ho, Devin Matthews) +- Removed a sanity check in `bli_pool_finalize()` that prevented BLIS from being re-initialized. (AMD) +- Fixed insufficient `pool_t`-growing logic in `bli_pool.c`, and always allocate at least one element in `.block_ptrs` array. (Minh Quan Ho) +- Cleanups related to the error message array in `bli_error.c`. (Minh Quan Ho) +- Moved language-related definitions from `bli_macro_defs.h` to a new header, `bli_lang_defs.h`. +- Renamed `BLIS_SIMD_NUM_REGISTERS` to `BLIS_SIMD_MAX_NUM_REGISTERS` and `BLIS_SIMD_SIZE` to `BLIS_SIMD_MAX_SIZE` for improved clarity. (Devin Matthews) +- Many minor bugfixes. 
+- Many cleanups, including removal of old and commented-out code. + +Compatibility: +- Expanded BLAS layer to include support for `?axpby_()` and `?gemm_batch_()`. (Meghana Vankadari, AMD) +- Added `gemm3m` APIs to BLAS and CBLAS layers. (Bhaskar Nallani, AMD) +- Handle `?gemm_()` invocations where m or n is unit by calling `?gemv_()`. (Dipal M Zambare, AMD) +- Removed option to finalize BLIS after every BLAS call. +- Updated default definitions of `bli_slamch()` and `bli_dlamch()` to use constants from standard C library rather than values computed at runtime. (Devin Matthews) + +Kernels: +- Added 512-bit SVE-based `a64fx` subconfiguration that uses empirically-tuned blocksizes (Stepan Nassyr, RuQing Xu) +- Added a vector-length agnostic `armsve` subconfig that computes blocksizes via an analytical model. (Stepan Nassyr) +- Added vector-length agnostic d/s/sh `gemm` kernels for Arm SVE. (Stepan Nassyr) +- Added `gemmsup` kernels to the `armv8a` kernel set for use in new Apple Firestorm subconfiguration. (RuQing Xu) +- Added 512-bit SVE `dpackm` kernels (16xk and 10xk) with in-register transpose. (RuQing Xu) +- Extended 256-bit SVE `dpackm` kernels by Linaro Ltd. to 512-bit for size 12xk. (RuQing Xu) +- Reorganized register usage in `bli_gemm_armv8a_asm_d6x8.c` to accommodate clang. (RuQing Xu) +- Added `saxpyf`/`daxpyf`/`caxpyf` kernels to `zen` kernel set. (Dipal M Zambare, AMD) +- Added `vzeroupper` instruction to `haswell` microkernels. (Devin Matthews) +- Added explicit `beta == 0` handling in s/d `armsve` and `armv7a` `gemm` microkernels. (Devin Matthews) +- Added a unique tag to branch labels to accommodate clang. (Devin Matthews, Jeff Hammond) +- Fixed a copy-paste bug in the loading of `kappa_i` in the two assembly `cpackm` kernels in `haswell` kernel set. (Devin Matthews) +- Fixed a bug in Mx1 `gemmsup` `haswell` kernels whereby the `vhaddpd` instruction is used with uninitialized registers. 
(Devin Matthews) +- Fixed a bug in the `power10` microkernel I/O. (Nicholai Tukanov) +- Many other Arm kernel updates and fixes. (RuQing Xu) + +Extras: +- Added support for addons, which are similar to sandboxes but do not require the user to implement any particular operation. +- Added a new `gemmlike` sandbox to allow rapid prototyping of `gemm`-like operations. +- Various updates and improvements to the `power10` sandbox, including a new testsuite. (Nicholai Tukanov) + +Build system: +- Added explicit support for AMD's Zen3 microarchitecture. (Dipal M Zambare, AMD, Field Van Zee) +- Added runtime microarchitecture detection for Arm. (Dave Love, RuQing Xu, Devin Matthews) +- Added a new `configure` option `--[en|dis]able-amd-frame-tweaks` that allows BLIS to compile certain framework files (each with the `_amd` suffix) that have been customized by AMD for improved performance (provided that the targeted configuration is eligible). By default, the more portable counterparts to these files are compiled. (Field Van Zee, AMD) +- Added an explicit compiler predicate (`is_win`) for Windows in `configure`. (Devin Matthews) +- Use `-march=haswell` instead of `-march=skylake-avx512` on Windows. (Devin Matthews, @h-vetinari) +- Fixed `configure` breakage on MacOSX by accepting either `clang` or `LLVM` in vendor string. (Devin Matthews) +- Blacklist clang10/gcc9 and older for `armsve` subconfig. +- Added a `configure` option to control whether or not to use `@rpath`. (Devin Matthews) +- Added armclang detection to `configure`. (Devin Matthews) +- Use `@path`-based install name on MacOSX and use relocatable `RPATH` entries for testsuite binaries. (Devin Matthews) +- For environment variables `CC`, `CXX`, `FC`, `PYTHON`, `AR`, and `RANLIB`, `configure` will now print an error message and abort if a user specifies a specific tool and that tool is not found. (Field Van Zee, Devin Matthews) +- Added symlink to `blis.pc.in` for out-of-tree builds. 
(Andrew Wildman) +- Register optimized real-domain `copyv`, `setv`, and `swapv` kernels in `zen` subconfig. (Dipal M Zambare, AMD) +- Added Apple Firestorm (A14/M1) subconfiguration, `firestorm`. (RuQing Xu) +- Added `armsve` subconfig to `arm64` configuration family. (RuQing Xu) +- Allow using clang with the `thunderx2` subconfiguration. (Devin Matthews) +- Fixed a subtle substitution bug in `configure`. (Chengguo Sun) +- Updated top-level Makefile to reflect a dependency on the "flat" `blis.h` file for the BLIS and BLAS testsuite objects. (Devin Matthews) +- Mark `xerbla_()` as a "weak" symbol on MacOSX. (Devin Matthews) +- Fixed a long-standing bug in `common.mk` whereby the header path to `cblas.h` was omitted from the compiler flags when compiling CBLAS files within BLIS. +- Added a custom-made recursive `sed` script to `build` directory. +- Minor cleanups and fixes to `configure`, `common.mk`, and others. + +Testing: +- Fixed a race condition in the testsuite when the SALT option (simulate application-level threading) is enabled. (Devin Matthews) +- Test 1m method execution during `make check`. (Devin Matthews) +- Test `make install` in Travis CI. (Devin Matthews) +- Test C++ in Travis CI to make sure `blis.h` is C++-compatible. (Devin Matthews) +- Disabled SDE testing of pre-Zen microarchitectures via Travis CI. +- Added Travis CI support for testing Arm SVE. (RuQing Xu) +- Updated SDE usage so that it is downloaded from a separate repository (ci-utils) in our GitHub organization. (Field Van Zee, Devin Matthews) +- Updated octave scripts in `test/3` to be robust against missing datasets as well as to fix a few minor issues. +- Added `test_axpbyv.c` and `test_gemm_batch.c` test driver files to `test` directory. (Meghana Vankadari, AMD) +- Support all four datatypes in `her`, `her2`, `herk`, and `her2k` drivers in `test` directory. (Madan mohan Manokar, AMD) + +Documentation: +- Added documentation for: `setijv`, `getijv`, `eqsc`, `eqv`, `eqm`.
+- Added `docs/Addons.md`. +- Added dedicated "Performance" and "Example Code" sections to `README.md`. +- Updated `README.md`. +- Updated `docs/Sandboxes.md`. +- Updated `docs/Multithreading.md`. (Devin Matthews) +- Updated `docs/KernelHowTo.md`. +- Updated `docs/Performance.md` to report Fujitsu A64fx (512-bit SVE) results. (RuQing Xu) +- Updated `docs/Performance.md` to report Graviton2 Neoverse N1 results. (Nicholai Tukanov) +- Updated `docs/FAQ.md` with new questions. +- Fixed typos in `docs/FAQ.md`. (Gaëtan Cassiers) +- Various other minor fixes. + ## Changes in 0.8.1 March 22, 2021 From 14c86f66b20901b60ee276da355c1b62642c18d2 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 1 Apr 2022 08:12:06 -0500 Subject: [PATCH 046/230] Version file update (0.9.0) --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index 6f4eebdf6..ac39a106c 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.8.1 +0.9.0 From 88cab8383ca90ddbb4cf13e69b7d44a1663a4425 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 1 Apr 2022 08:12:06 -0500 Subject: [PATCH 047/230] CHANGELOG update (0.9.0) --- CHANGELOG | 2886 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 2882 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 13eaa52ca..27bb039b5 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,10 +1,2888 @@ -commit 8535b3e11d2297854991c4272932ce4974dda629 (HEAD -> master, tag: 0.8.1) +commit 14c86f66b20901b60ee276da355c1b62642c18d2 (HEAD -> master, tag: 0.9.0) +Author: Field G. Van Zee +Date: Fri Apr 1 08:12:06 2022 -0500 + + Version file update (0.9.0) + +commit 99bb9002f1aff598d347eae2821a3f7bdd1f48e8 (origin/master, origin/HEAD) +Author: Field G. Van Zee +Date: Fri Apr 1 08:10:59 2022 -0500 + + ReleaseNotes.md update in advance of next version. + +commit bee7678b2558a691ac850819dbe33fefe4fdbee3 (origin/dev, origin/amd, dev, amd) +Author: Field G. 
Van Zee +Date: Thu Mar 31 14:09:39 2022 -0500 + + CREDITS file update. + +commit cf06364327bd2d21d606392371ff3c5962bee5ba +Author: Field G. Van Zee +Date: Tue Mar 29 16:18:25 2022 -0500 + + Fixed typo in BLAS gemm3m call to _check(). + + Details: + - Fixed an unresolved symbol issue leftover from #590 whereby ?gemm3m_() + as defined in bla_gemm3m.c was referencing bla_gemm3m_check(), which + does not exist. It should have simply called the _check() function for + gemm. + +commit 1ec020b33ece1681c0041e2549eed2bd4c6cf356 +Author: Dipal M Zambare <71366780+dzambare@users.noreply.github.com> +Date: Wed Mar 30 02:45:36 2022 +0530 + + AMD kernel updates; frame-specific AMD updates. (#597) + + Details: + - Allow building BLIS with certain framework files (each with the '_amd' + suffix) that have been customized by AMD for Zen-based hardware. These + customized files were derived from portable versions of the same files + (i.e., those without the '_amd' suffix). Whether the portable or AMD- + specific files are compiled is now controlled by a new configure + option, --[en|dis]able-amd-frame-tweaks. This option is disabled by + default in vanilla BLIS, though AMD may choose to enable it by default + in their fork. For now, the added AMD-specific files are: + - bli_gemv_unf_var2_amd.c + - bla_copy_amd.c + - bla_gemv_amd.c + These files reside in 'amd' subdirectories found within the directory + housing their generic counterparts. + - Register optimized real-domain copyv, setv, and swapv kernels in + bli_cntx_init_zen.c. + - Various minor updates to level-1v kernels in 'zen' kernel set. + - Added caxpyf kernel as well as saxpyf and multiple daxpyf kernels to + the 'zen' kernel set + - If the problem passed to ?gemm_() in bla_gemm.c has a unit m or n dim, + call gemv instead and return early. 
+ - Combined variable declarations with their initialization in various + level-2 and level-3 BLAS compatibility files, and also inserted + 'const' qualifier in those same declaration statements. + - Moved frame/compat/bla_gemmt.c and .h to frame/compat/extra/ . + - Added copyv and swapv test drivers to 'test' directory. + - Whitespace, comment changes. + +commit 0db2bd5341c5c3ed5f1cc2bffa90952735efa45f +Author: Bhaskar Nallani +Date: Fri Mar 25 05:11:55 2022 +0530 + + Added BLAS/CBLAS APIs for gemm3m. (#590) + + Details: + - Created ?gemm3m_() and cblas_?gemm3m() APIs that (for now) simply + invoke the 1m implementation unconditionally. (Note that these APIs + bypass sup handling.) + - Added BLAS prototypes for gemm3m in frame/compat/bla_gemm3m.h. + - Added CBLAS prototypes for gemm3m in frame/compat/cblas/src/cblas.h. + - Relocated: + frame/compat/cblas/src/cblas_?gemmt.c + files into + frame/compat/cblas/src/extra/ + - Relocated frame/compat/bla_gemmt.? into frame/compat/extra/ . + - Minor reorganization of prototypes and cpp macro directives in + bli_blas.h, cblas.h, and cblas_f77.h. + - Trivial whitespace change to cblas_zgemm.c. + +commit d6810000e961fe807dc5a7db81180a8355f3eac0 +Author: Devin Matthews +Date: Mon Mar 14 10:29:54 2022 -0500 + + Update Multithreading.md + + Add notes about `BLIS_IR_NT` (should typically be 1) and `BLIS_JR_NT` (should typically be small, e.g. <= 4). [ci skip] + +commit f1dbb0e514f53a3240d3a6cbdc3306b01a2206f5 +Author: Field G. Van Zee +Date: Fri Mar 11 13:38:28 2022 -0600 + + Trivial whitespace change; commit log addendum. + + Details: + - A co-attribution to Mithun Mohan was inadvertently omitted from the + commit log for headline change in the previous commit, 7c07b47. + +commit 7c07b477e432adbbce5812ed9341ba3092b03976 +Author: Field G. Van Zee +Date: Fri Mar 11 13:28:50 2022 -0600 + + Avoid gemmsup barriers when not packing A or B.
(#622) + + Details: + - Implemented a multithreaded optimization for the special (and common) + case of employing the gemmsup code path when the user requests + (implicitly or explicitly) that neither A nor B be packed during + computation. This optimization takes the form of a greatly reduced + code branch in bli_thrinfo_sup_create_for_cntl(), which avoids a + broadcast and two barriers, and results in higher performance when + obtaining two-way or higher parallelism within BLIS. Thanks to + Bhaskar Nallani of AMD for proposing this change via issue #605. + - Added an early return branch to bli_thrinfo_create_for_cntl() that + detects and quickly handles cases where no parallelism is being + obtained within BLIS (i.e., single-threaded execution). Note that + this special case handling was/is already present in + bli_thrinfo_sup_create_for_cntl(). + - CREDITS file update. + +commit cad10410b2305bc0e328c5f2517ab02593b53428 +Author: Ivan Korostelev +Date: Thu Mar 10 09:58:14 2022 -0600 + + POWER10: edge cases in microkernel (#620) + + Use new API for POWER10 gemm microkernel + +commit 71851a0549276b17db18a0a0c8ab4f54493bf033 +Author: Field G. Van Zee +Date: Tue Mar 8 17:38:09 2022 -0600 + + Fixed level-3 performance bug in haswell ukernels. + + Details: + - Fixed a performance regression affecting nearly all level-3 operations + that use the 'haswell' sgemm and dgemm microkernels. This regression + was introduced in 54fa28b, caused by an ill-formed conditional + expression in the assembly code that controls whether cache lines of C + should be prefetched as rows or as columns. Essentially, the two + branches were reversed, causing incomplete prefetching to occur for + both row- and column-stored instances of matrix C. Thanks to Devin + Matthews for his help finding and fixing this bug. + +commit 84732bf95634ac606c5f2661d9474318e366c386 +Author: Field G. Van Zee +Date: Mon Feb 28 12:19:31 2022 -0600 + + Revamp how tools are handled/checked by configure. 
+ + Details: + - Consolidate handling of tools that are specifiable via CC, CXX, FC, + PYTHON, AR, and RANLIB into one bash function, select_tool_w_env(). + - If the user specifies a tool via an environment variable (e.g. + CC=gcc) and that tool does not seem valid, print an error message + and abort configure, unless the tool is optional (e.g. CXX or FC), + in which case a warning message is printed instead. + - The definition of "seems valid" above amounts to: + - responding to at least one of a basic set of command line options + (e.g. --version, -V, -h) if the os_name is Linux (since GNU tools + tend to respond to flags such as --version) or if the tool in + question is CC, CXX, FC, or PYTHON (which tend to respond to the + expected flags regardless of OS) + - the binary merely existing for AR and RANLIB on Darwin/OSX/BSD. + (These OSes tend to have non-GNU versions of ar and ranlib, which + typically do not respond to --version and friends.) + - This PR addresses #584. Thanks to Devin Matthews for suggesting some + of the changes in this commit. + +commit d5146582b1f1bcdccefe23925d3b114d40cd7e31 +Author: RuQing Xu +Date: Wed Feb 23 03:35:46 2022 +0900 + + ArmSVE Ensure Non-zero Block Size (#615) + + Fixes #613. There are several macros/environment variables which need to be tuned to get good cache block sizes. It would be nice to have a way of getting values automatically. + +commit 4d8352309784403ed6719528968531ffb4483947 +Author: RuQing Xu +Date: Wed Feb 23 01:03:47 2022 +0900 + + Add armsve to arm64 Metaconfig (#614) + + Availability of the `armsve` subconfig is controlled by the compiler version (gcc/clang). Tested for SVE and non-SVE. Fixes #612. + +commit c9700f369aa84fc00f36c4b817ffb7dab72b865d +Author: Field G. Van Zee +Date: Tue Feb 15 15:36:52 2022 -0600 + + Renamed SIMD-related macro constants for clarity. 
+ + Details: + - Renamed the following macros defined in bli_kernel_macro_defs.h: + + BLIS_SIMD_NUM_REGISTERS -> BLIS_SIMD_MAX_NUM_REGISTERS + BLIS_SIMD_SIZE -> BLIS_SIMD_MAX_SIZE + + Also updated all instances of these macros elsewhere, including + subconfigurations, source code, and documentation. Thanks to Devin + Matthews for suggesting this change. + +commit ee9ff988c49f16696679d4c6cd3dcfcac7295be7 +Author: Field G. Van Zee +Date: Tue Feb 15 15:01:51 2022 -0600 + + Move edge cases to gemmtrsm ukrs; doc updates. + + Details: + - Moved edge-case handling into the gemmtrsm microkernel. This required + changing the microkernel API to take m and n dimension parameters as + well as updating all existing gemmtrsm microkernel function pointer + types, function signatures, and related definitions to take m and n + dimensions. Also updated all existing gemmtrsm kernels in the + 'kernels' directory (which for now is limited to haswell and penryn + kernel sets, plus native and 1m-based reference kernels in + 'ref_kernels') to take m and n dimensions, and implemented edge-case + handling within those microkernels via a collection of new C + preprocessor macros defined within bli_edge_case_macro_defs.h. Note + that the edge-case handling for gemm-like operations had already + been relocated into the gemm microkernel in 54fa28b. + - Added descriptive comments to GEMM_UKR_SETUP_CT() and related macros in + bli_edge_case_macro_defs.h to allow for easier reading. + - Updated docs/KernelsHowTo.md to reflect above changes. Also cleaned up + the bullet under "Implementation Notes for gemm" that covers alignment + issues. (Thanks to Ivan Korostelev for pointing out the confusing and + outdated language in issue #591.) + - Other minor tweaks to KernelsHowTo.md. + +commit 25061593460767221e1066f9d720fa6676bbed8f +Author: Devin Matthews +Date: Sun Feb 13 20:11:55 2022 -0600 + + Don't use `-Wl,-flat-namespace`.
+ + Flat namespaces can cause problems due to conflicting system libraries, + etc., so just mark `xerbla_` as a weak symbol on macOS instead. + +commit 5a4d3f5208d3d8cc1827f8cc90414c764b7ebab3 +Author: Devin Matthews +Date: Sun Feb 13 17:28:30 2022 -0600 + + Use -flat_namespace option to link on macOS + + Fixes #611. + +commit 26742910a087947780a089360e2baf82ea109e01 +Author: Devin Matthews +Date: Sun Feb 13 16:53:45 2022 -0600 + + Update CC_VENDOR logic + + Look for `GCC` in addition to `gcc` to handle weird conda version strings. [ci skip] + +commit 2f3872e01d51545c687ae2c8b2650e00552111a7 +Author: RuQing Xu +Date: Mon Feb 7 17:14:49 2022 +0900 + + ArmSVE Adopts Label Wrapper + + For clang (& armclang?) compilation. + + Hopefully solves #609 . + +commit 72089bb2917b78d99cf4f27c69125bf213ee54e6 +Author: RuQing Xu +Date: Sat Feb 5 16:56:04 2022 +0900 + + ArmSVE Use Predicate in M-Direction + + No need to query MR during kernel runtime. + +commit 9cc897f37455d52fbba752e3801f1a9d4a5bfdc1 +Author: Ruqing Xu +Date: Thu Feb 3 16:40:02 2022 +0000 + + Fix SVE Compil. + +commit b5df1811f1bc8212b2cda6bb97b79819afe236a8 +Author: RuQing Xu +Date: Thu Feb 3 02:31:29 2022 +0900 + + Armv8a, ArmSVE: Simplify Gen-C + +commit 35195bb5cea5d99eb3eaf41e3815137d14ceb52d +Author: Devin Matthews +Date: Mon Jan 31 10:29:50 2022 -0600 + + Add armclang detection to configure. + + armclang is treated as regular clang. Fixes #606. [ci skip] + +commit 0be9282cdccf73342d8571d3f7971a9b0af72363 +Author: Field G. Van Zee +Date: Wed Jan 26 17:46:24 2022 -0600 + + Updated zen3 macro constant names. + + Details: + - In config/zen3/bli_family_zen3.h, renamed: + BLIS_SMALL_MATRIX_A_THRES_M_GEMMT -> _M_SYRK + BLIS_SMALL_MATRIX_A_THRES_N_GEMMT -> _N_SYRK + Thanks to Jeff Diamond for helping spot the stale _SYRK naming. 
+ +commit 0ab20c0e72402ba0b17fe2c3ed3e16bf2ace0fd3 +Author: Jeff Hammond +Date: Thu Jan 13 07:29:56 2022 -0800 + + the Apple local label thing is required by Clang in general + + @egaudry and I both saw this issue on Linux with Clang 10. + + ``` + Compiling obj/thunderx2/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o ('thunderx2' CFLAGS for kernels) + kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c:171:49: fatal error: invalid symbol redefinition + " \n\t" + ^ + :90:5: note: instantiated into assembly here + .SLOOPKITER: + ^ + 1 error generated. + ``` + + Signed-off-by: Jeff Hammond + +commit 81f93be0561c705ae6823d19e40849facc40bef7 +Author: Devin Matthews +Date: Mon Jan 10 10:19:47 2022 -0600 + + Fix row-/column-major pref. in 16x8 haswell sgemm ukr (unused) + +commit 268ce1f29a717d18304713ecc25a2eafe41838c7 +Author: Devin Matthews +Date: Mon Jan 10 10:17:17 2022 -0600 + + Relax alignment constraints + + Remove alignment of temporary AB buffer in edge case handling macros unless alignment is specifically requested (e.g. Core2, SDB/IVB). Fixes #595. + +commit 3f2440b0226d5e23a43d12105d74aa917cd6c610 +Author: Field G. Van Zee +Date: Thu Jan 6 14:57:36 2022 -0600 + + Added m, n dims to gemmd/gemmlike ukernel calls. + + Details: + - Updated the gemmd addon and the gemmlike sandbox code to use the new + microkernel calling sequence, which now includes m and n dimensions so + that the microkernel has all the information necessary to handle edge + cases. Thanks to Jeff Diamond for catching this, which ideally would + have been included in commit 54fa28b. + - Retired var2 of both gemmd and gemmlike to 'attic' directories and + removed their corresponding prototypes. In both cases, var2 was a + variant of the block-panel algorithm where edge-case handling was + abstracted away to a microkernel wrapper. (Since this is now the + official behavior of BLIS microkernels, I saw no need to have it + included as a separate code path.) + - Comment updates. 
+ +commit 864bfab4486ac910ef9a366e9ade4b45a39747fc +Author: Field G. Van Zee +Date: Tue Jan 4 15:10:34 2022 -0600 + + CREDITS file update. + +commit 466b68a3ad118342dc49a8130b7b02f5e7748521 +Author: Devin Matthews +Date: Sun Jan 2 14:59:41 2022 -0600 + + Add unique tag to branch labels for Apple ARM64. + + Add `%=` tag to branch labels, which expands to a unique identifier for each inline assembly block. This prevents duplicate symbol errors on Apple Silicon (#594). Fixes #594. [ci skip] since we can't test Apple Silicon anyways... + +commit 08174a2f6ebbd8ed5aa2bc4edc45da80962f06bb +Author: RuQing Xu +Date: Sat Jan 1 21:35:19 2022 +0900 + + Evict Requirement for SVE GEMM + + For 8<= GCC < 10 compatibility. + +commit 54fa28bd847b389215cffb57a83dc9b3dce79c86 +Author: Devin Matthews +Date: Fri Dec 24 08:00:33 2021 -0600 + + Move edge cases to gemm ukr; more user-custom mods. (#583) + + Details: + - Moved edge-case handling into the gemm microkernel. This required + changing the microkernel API to take m and n dimension parameters. + This required updating all existing gemm microkernel function pointer + types, function signatures, and related definitions to take m and n + dimensions. We also updated all existing kernels in the 'kernels' + directory to take m and n dimensions, and implemented edge-case + handling within those microkernels via a collection of new C + preprocessor macros defined within bli_edge_case_macro_defs.h. Also + removed the assembly code that formerly would handle general stride + IO on the microtile, since this can now be handled by the same code + that does edge cases. + - Pass the obj_t.ker_fn (of matrix C) into bli_gemm_cntl_create() and + bli_trsm_cntl_create(), where this function pointer is used in lieu of + the default macrokernel when it is non-NULL, and ignored when it is + NULL. 
+ - Re-implemented macrokernel in bli_gemm_ker_var2.c to be a single + function using byte pointers rather than one function for each + floating-point datatype. Also, obtain the microkernel function pointer + from the .ukr field of the params struct embedded within the obj_t + for matrix C (assuming params is non-NULL and contains a non-NULL + value in the .ukr field). Communicate both the gemm microkernel + pointer to use as well as the params struct to the microkernel via + the auxinfo_t struct. + - Defined gemm_ker_params_t type (for the aforementioned obj_t.params + struct) in bli_gemm_var.h. + - Retired the separate _md macrokernel for mixed datatype computation. + We now use the reimplemented bli_gemm_ker_var2() instead. + - Updated gemmt macrokernels to pass m and n dimensions into microkernel + calls. + - Removed edge-case handling from trmm and trsm macrokernels. + - Moved most of bli_packm_alloc() code into a new helper function, + bli_packm_alloc_ex(). + - Fixed a typo bug in bli_gemmtrsm_u_template_noopt_mxn.c. + - Added test/syrk_diagonal and test/tensor_contraction directories with + associated code to test those operations. + +commit 961d9d509dd94f3a66f7095057e3dc8eb6d89839 +Author: Kiran +Date: Wed Dec 8 03:00:38 2021 +0530 + + Re-add BLIS_ENABLE_ZEN_BLOCK_SIZES macro for 'zen'. + + Details: + - Added previously-deleted cpp macro block to bli_cntx_init_zen.c + targeting the Naples microarchitecture that enabled different cache + blocksizes when the number of threads exceeds 16. This commit + represents PR #573. + +commit cf7d616a2fd58e293b496770654040818bf5609c +Author: Devin Matthews +Date: Thu Dec 2 17:10:03 2021 -0600 + + Enable user-customized packm ukernel/variant. (#549) + + Details: + - Added four new fields to obj_t: .pack_fn, .pack_params, .ker_fn, and + .ker_params.
These fields store pointers to functions and data that + will allow the user to more flexibly create custom operations while + recycling BLIS's existing partitioning infrastructure. + - Updated typed API to packm variant and structure-aware kernels to + replace the diagonal offset with panel offsets, and changed strides + of both C and P to inc/ldim semantics. Updated object API to the packm + variant to include rntm_t*. + - Removed the packm variant function pointer from the packm cntl_t node + definition since it has been replaced by the .pack_fn pointer in the + obj_t. + - Updated bli_packm_int() to read the new packm variant function pointer + from the obj_t and call it instead of from the cntl_t node. + - Moved some of the logic of bli_l3_packm.c to a new file, + bli_packm_alloc.c. + - Rewrote bli_packm_blk_var1.c so that it uses byte (char*) pointers + instead of typed pointers, allowing a single function to be used + regardless of datatype. This obviated having a separate implementation + in bli_packm_blk_var1_md.c. Also relegated handling of scalars to a + new function, bli_packm_scalar(). + - Employed a new standard whereby right-hand matrix operands ("B") are + always packed as column-stored row panels -- that is, identically to + that of left-hand matrix operands ("A"). This means that while we pack + matrix A normally, we actually pack B in a transposed state. This + allowed us to simplify a lot of code throughout the framework, and + also affected some of the logic in bli_l3_packa() and _packb(). + - Simplified bli_packm_init.c in light of the new B^T convention + described above. bli_packm_init()--which is now called from within + bli_packm_blk_var1()--also now calls bli_packm_alloc() and returns + a bool that indicates whether packing should be performed (or + skipped). 
+ - Consolidated bli_gemm_int() and bli_trsm_int() into a bli_l3_int(), + which, among other things, defaults the new .pack_fn field of the + obj_t to bli_packm_blk_var1() if the field is NULL. + - Defined a new function, bli_obj_reset_origin(), which permanently + refocuses the view of an object so that it "forgets" any offsets from + its original pointer. This function also sets the object's root field + to itself. Calls to bli_obj_reset_origin() for each matrix operand + appear in the _front() functions, after the obj_t's are aliased. This + resetting of the underlying matrices' origins is needed in preparation + for more advanced features from within custom packm kernels. + - Redefined bli_pba_rntm_set_pba() from a regular function to a static + inline function. + - Updated gemm_ukr, gemmtrsm_ukr, and trsm_ukr testsuite modules to use + libblis_test_pobj_create() to create local packed objects. Previously, + these packed objects were created by calling lower-level functions. + +commit e229e049ca08dfbd45794669df08a71dba892925 +Author: Field G. Van Zee +Date: Wed Dec 1 17:36:22 2021 -0600 + + Added recu-sed.sh script to 'build' directory. + + Details: + - Added a recursive sed script to the 'build' directory. + +commit 12c66a4acc77bf4927b01e2358e2ac10b61e0a53 +Author: Field G. Van Zee +Date: Fri Nov 19 14:43:53 2021 -0600 + + Minor updates to README.md, docs/Addons.md. + + Details: + - Add additional mentions of addons to README.md, including in the + "What's New" section. + - Removed mention of sandboxes from the long list of advantages + provided by BLIS. + - Very minor description update to opening line of Addons.md. + +commit a4bc03b990fe0572001eb6409efd12cd70677dcf +Author: Field G. Van Zee +Date: Fri Nov 19 13:29:00 2021 -0600 + + Brief mention/link to Addons.md in README.md. + + Details: + - Add a blurb about the new addons feature to the "Documentation for + BLIS developers" section of the README.md, which also links to the + Addons.md document. 
+ +commit b727645eb7a8df39dee74068f734da66322fe0b3 +Merge: 9be97c15 7bde468c +Author: Field G. Van Zee +Date: Fri Nov 19 13:22:09 2021 -0600 + + Merge branch 'dev' + +commit 9be97c150e19fa58bca30cb993a6509ae21e2025 +Author: Madan mohan Manokar <86282872+madanm3@users.noreply.github.com> +Date: Thu Nov 18 00:46:46 2021 +0530 + + Support all four dts in test/test_her[2][k].c (#578) + + Details: + - Replaced the hard-coded calls to double-precision real syr, syr2, + syrk, and syrk in the corresponding standalone test drivers in the + 'test' directory with conditional branches that will call the + appropriate BLAS interface depending on which datatype is enabled. + Thanks to Madan mohan Manokar for this improvement. + - CREDITS file update. + +commit 26e4b6b29312b472c3cadf95ccdf5240764777f4 +Author: Dipal M Zambare <71366780+dzambare@users.noreply.github.com> +Date: Thu Nov 18 00:32:00 2021 +0530 + + Added support for AMD's Zen3 microarchitecture. + + Details: + - Added a new 'zen3' subconfiguration targeting support for the AMD Zen3 + microarchitecture (#561). Thanks to AMD for this contribution. + - Restructured clang and AOCC support for zen, zen2, and zen3 + make_defs.mk files. The clang and AOCC version detection now happens + in configure, not in the subconfigurations' makefile fragments. That + is, we've added logic to configure that detects the version of + clang/AOCC, outputs an appropriate variable to config.mk + (ie: CLANG_OT_*, AOCC_OT_*), and then checks for it within the + makefile fragment (as is currently done for the GCC_OT_* variables). + - Added configure support for a GCC_OT_10_1_0 variable (and associated + substitution anchor) to communicate whether the gcc version is older + than 10.1.0, and use this variable to check for recent enough versions + of gcc to use -march=znver3 in the zen3 subconfig. 
+ - Inlined the contents of config/zen/amd_config.mk into the zen and zen2 + make_defs.mk so that the files are self-contained, harmonizing the + format of all three Zen-based subconfigurations' make_defs.mk files. + - Added indenting (with spaces) of GNU make conditionals for easier + reading in zen, zen2, and zen3 make_defs.mk files. + - Adjusted the range of models checked by bli_cpuid_is_zen() (which was + previously 0x00 ~ 0xff and is now 0x00 ~ 0x2f) so that it is + completely disjoint from the models checked by bli_cpuid_is_zen2() + (0x30 ~ 0xff). This is normally necessary because Zen and Zen2 + microarchitectures share the same family (23, or 0x17), and so the + model code is the only way to differentiate the two. But in our case, + fixing the model range for zen *wasn't* actually necessary since we + checked for zen2 first, and therefore the wide zen range acted like + the 'else' of an 'if-else' statement. That said, the change helps + improve clarity for the reader by encoding useful knowledge, which + was obtained from https://en.wikichip.org/wiki/amd/cpuid . + - Added zen2.def and zen3.def files to the collection in travis/cpuid. + Note that support for zen, zen2, and zen3 is now present, and while + all the three microarchitectures have identical instruction sets from + the perspective of BLIS microkernels, they each correspond to + different subconfigurations and therefore merit separate testing. + Thanks to Devin Matthews for his guidance in hacking these files as + slight modifications of zen.def. + - Enabled testing of zen2 and zen3 via the SDE in travis/do_sde.sh. + Now, zen, zen2, and zen3 are tested through the SDE via Travis CI + builds. + - Updated travis/do_sde.sh to grab the SDE tarball from a new ci-utils + repository on GitHub rather than on Intel's website. This change was + made in an attempt to circumvent recent troubles with Travis CI not + being able to download the SDE directly from Intel's website via curl. 
+ Thanks to Devin Matthews for suggesting the idea. + - Updated travis/do_sde.sh to grab the latest version (8.69.1) of the + Intel SDE from the flame/ci-utils repository. + - Updated .travis.yml to use gcc 9. The file was previously using gcc 8, + which did not support -march=znver2. + - Created amd64_legacy umbrella family in config_registry for targeting + older (bulldozer, piledriver, steamroller, and excavator) + microarchitectures and moved those same subconfigs out of the amd64 + umbrella family. However, x86_64 retains amd64_legacy as a constituent + member. + - Fixed a bug in configure related to the building of the so-called + config list. When processing the contents of config_registry, + configure creates a series of structures and lists that allow for + various mappings related to configuration families, subconfigs, and + kernel sets. Two of those lists are built via substitution of + umbrella families with their subconfig members, and one of those + lists was improperly performing the substitution in a way that would + erroneously match on partial umbrella family names. That code was + changed to match the code that was already doing the substitution + properly, via substitute_words(). Also added comments noting the + importance of using substitute_words() in both instances. + - Comment updates. + +commit 74c0c622216aba0c24aa2c3a923811366a160cf5 +Author: Field G. Van Zee +Date: Tue Nov 16 16:06:33 2021 -0600 + + Reverted cbc88fe. + + Details: + - Reverted the annotation of some markdown code blocks with 'bash' + after realizing that the in-browser syntax highlighting was not + worthwhile. + +commit cbc88feb51b949ce562d044cf9f99c4e46bb8a39 +Author: Field G. Van Zee +Date: Tue Nov 16 16:02:39 2021 -0600 + + Marked some markdown shell code blocks as 'bash'. + + Details: + - Annotated the code blocks that represent shell commands and output as + 'bash' in README.md and BuildSystem.md. + +commit 78cd1b045155ddf0b9ec6e2ab815f2b216ad9a9e +Author: Field G. 
Van Zee +Date: Tue Nov 16 15:53:40 2021 -0600 + + Added 'Example Code' section to README.md. + + Details: + - Inserted a new 'Example Code' section into the README.md immediately + after the 'Getting Started' section. Thanks to Devin Matthews for + recommending this addition. + - Moved the 'Performance' section of the README down slightly so that it + appears after the 'Documentation' section. + +commit 7bde468c6f7ecc4b5322d2ade1ae9c0b88e6b9f3 +Author: Field G. Van Zee +Date: Sat Nov 13 16:39:37 2021 -0600 + + Added support for addons. + + Details: + - Implemented a new feature called addons, which are similar to + sandboxes except that there is no requirement to define gemm or any + other particular operation. + - Updated configure to accept --enable-addon= or -a syntax + for requesting an addon be included within a BLIS build. configure now + outputs the list of enabled addons into config.mk. It also outputs the + corresponding #include directives for the addons' headers to a new + companion to the bli_config.h header file named bli_addon.h. Because + addons may wish to make use of existing BLIS types within their own + definitions, the addons' headers must be included sometime after that + of bli_config.h (which currently is #included before bli_type_defs.h). + This is why the #include directives needed to go into a new top-level + header file rather than the existing bli_config.h file. + - Added a markdown document, docs/Addons.md, to explain addons, how to + build with them, and what assumptions their authors should keep in + mind as they create them. + - Added a gemmlike-like implementation of sandwich gemm called 'gemmd' + as an addon in addon/gemmd. The code uses a 'bao_' prefix for local + functions, including the user-level object and typed APIs. + - Updated .gitignore so that git ignores bli_addon.h files. 
+ +commit 7bc8ab485e89cfc6032932e57929e208a28f4be5 +Author: Meghana-vankadari <74656386+Meghana-vankadari@users.noreply.github.com> +Date: Fri Nov 12 04:16:14 2021 +0530 + + Added BLAS/CBLAS APIs for axpby, gemm_batch. (#566) + + Details: + - Expanded the BLAS compatibility layer to include support for + ?axpby_() and ?gemm_batch_(). The former is a straightforward + BLAS-like interface into the axpbyv operation while the latter + implements a batched gemm via loops over bli_?gemm(). Also + expanded the CBLAS compatibility layer to include support for + cblas_?axpby() and cblas_?gemm_batch(), which serve as wrappers to + the corresponding (new) BLAS-like APIs. Thanks to Meghana Vankadari + for submitting these new APIs via #566. + - Fixed a long-standing bug in common.mk that for some reason never + manifested until now. Previously, CBLAS source files were compiled + *without* the location of cblas.h being specified via a -I flag. + I'm not sure why this worked, but it may be due to the fact that + the cblas.h file resided in the same directory as all of the CBLAS + source, and perhaps compilers implicitly add a -I flag for the + directory that corresponds to the location of the source file being + compiled. This bug only showed up because some CBLAS-like source code + was moved into an 'extra' subdirectory of that frame/compat/cblas/src + directory. After moving the code, compilation for those files failed + (because the cblas.h header file, presumably, could not be found in + the same location). This bug was fixed within common.mk by explicitly + adding the cblas.h directory to the list of -I flags passed to the + compiler. + - Added test_axpbyv.c and test_gemm_batch.c files to 'test' directory, + and updated test/Makefile to build those drivers. + - Fixed typo in error message string in cblas_sgemm.c. + +commit 28b0982ea70c21841fb23802d38f6b424f8200e1 +Author: Devin Matthews +Date: Wed Nov 10 12:34:50 2021 -0600 + + Refactored her[2]k/syr[2]k in terms of gemmt. 
(#531) + + Details: + - Renamed herk macrokernels and supporting files and functions to gemmt, + which is possible since at the macrokernel level they are identical. + Then recast herk/her2k/syrk/syr2k in terms of gemmt within the expert + level-3 oapi (bli_l3_oapi_ex.c) while also redefining them as literal + functions rather than cpp macros that instantiate multiple functions. + Thanks to Devin Matthews for his efforts on this issue (#531). + - Check that the maximum stack buffer size is sufficiently large + relative to the register blocksizes for each datatype, and do so when + the context is initialized rather than when an operation is called. + Note that with this change, users who pass in their own contexts into + the expert interfaces currently will *not* have any checks performed. + Thanks to Devin Matthews for suggesting this change. + +commit cfa3db3f3465dc58dbbd842f4462e4b49e7768b4 +Author: Field G. Van Zee +Date: Wed Nov 3 18:13:56 2021 -0500 + + Fixed bug in mixed-dt gemm introduced in e9da642. + + Details: + - Fixed a bug that broke certain mixed-datatype gemm behavior. This + bug was introduced recently in e9da642 when the code that performs + the operation transposition (for microkernel IO preference purposes) + was moved up so that it occurred sooner. However, when I moved that + code, I failed to notice that there was a cpp-protected "if" + conditional that applied to the entire code block that was moved. Once + the code block was relocated, the orphaned if-statement was now + (erroneously) glomming on to the next thing that happened to be in the + function, which happened to be the call to bli_rntm_set_ways_for_op(), + causing a rather odd memory exhaustion error in the sba due to the + num_threads field of the rntm_t still being -1 (because the rntm_t + field were never processed as they should have been). Thanks to + @ArcadioN09 (Snehith) for reporting this error and helpfully including + relevant memory trace output. 
+ +commit f065a8070f187739ec2b34417b8ab864a7de5d7e +Author: Field G. Van Zee +Date: Thu Oct 28 16:05:43 2021 -0500 + + Removed support for 3m, 4m induced methods. + + Details: + - Removed support for all induced methods except for 1m. This included + removing code related to 3mh, 3m1, 4mh, 4m1a, and 4m1b as well as any + code that existed only to support those implementations. These + implementations were rarely used and posed code maintenance challenges + for BLIS's maintainers going forward. + - Removed reference kernels for packm that pack 3m and 4m micropanels, + and removed 3m/4m-related code from bli_cntx_ref.c. + - Removed support for 3m/4m from the code in frame/ind, then reorganized + and streamlined the remaining code in that directory. The *ind(), + *nat(), and *1m() APIs were all removed. (These additional API layers + no longer made as much sense with only one induced method (1m) being + supported.) The bli_ind.c file (and header) were moved to frame/base + and bli_l3_ind.c (and header) and bli_l3_ind_tapi.h were moved to + frame/3. + - Removed 3m/4m support from the code in frame/1m/packm. + - Removed 3m/4m support from trmm/trsm macrokernels and simplified some + pointer arithmetic that was previously expressed in terms of the + bli_ptr_inc_by_frac() static inline function (whose definition was + also removed). + - Removed the following subdirectories of level-0 macro headers from + frame/include/level0: ri3, rih, ri, ro, rpi. The level-0 scalar macros + defined in these directories were used exclusively for 3m and 4m + method codes. + - Simplified bli_cntx_set_blkszs() and bli_cntx_set_ind_blkszs() in + light of 1m being the only induced method left within BLIS. + - Removed dt_on_output field within auxinfo_t and its associated + accessor functions. + - Re-indexed the 1e/1r pack schemas after removing those associated with + variants of the 3m and 4m methods. This leaves two bits unused within + the pack format portion of the schema bitfield. 
(See bli_type_defs.h + for more info.) + - Spun off the basic and expert interfaces to the object and typed APIs + into separate files: bli_l3_oapi.c and bli_l3_oapi_ex.c; bli_l3_tapi.c + and bli_l3_tapi_ex.c. + - Moved the level-3 operation-specific _check function calls from the + operations' _front() functions to the corresponding _ex() function of + the object API. (This change roughly maintains where the _check() + functions are called in the call stack but lays the groundwork for + future changes that may come to the level-3 object APIs.) Minor + modifications to bli_l3_check.c to allow the check() functions to be + called from the expert interface APIs. + - Removed support within the testsuite for testing the aforementioned + induced methods, and updated the standalone test drivers in the 'test' + directory to reflect the retirement of those induced methods. + - Modified the sandbox contract so that the user is obliged to define + bli_gemm_ex() instead of bli_gemmnat(). (This change was made in light + of the *nat() functions no longer existing.) Also updated the existing + 'power10' and 'gemmlike' sandboxes to come into compliance with the + new sandbox rules. + - Updated BLISObjectAPI.md, BLISTypedAPI.md, Testsuite.md documentation + to reflect the retirement of 3m/4m, and also modified Sandboxes.md to + bring the document into alignment with new conventions. + - Updated various comments; removed segments of commented-out code. + +commit e8caf200a908859fa5f5ea2049911a9bdaa3d270 +Author: Field G. Van Zee +Date: Mon Oct 18 13:04:15 2021 -0500 + + Updated do_sde.sh to get SDE from GitHub. + + Details: + - Updated travis/do_sde.sh so that the script downloads the SDE tarball + from a new ci-utils repository on GitHub rather than from Intel's + website. This change is being made in an attempt to circumvent Travis + CI's recent troubles with downloading the SDE from Intel's website via + curl. Thanks to Devin Matthews for suggesting the idea.
+ +commit 290ff4b1c26737b074d5abbf76966bc22af8c562 +Author: Field G. Van Zee +Date: Thu Oct 14 16:09:43 2021 -0500 + + Disable SDE testing of old AMD microarchitectures. + + Details: + - Skip testing on piledriver, steamroller, and excavator platforms + in travis/do_sde.sh. + +commit 514fd101742dee557e5eb43d0023a221ae8a7172 +Author: Field G. Van Zee +Date: Thu Oct 14 13:50:28 2021 -0500 + + Fixed substitution bug in configure. + + Details: + - Fixed a bug in configure related to the building of the so-called + config list. When processing the contents of config_registry, + configure creates a series of structures and lists that allow for + various mappings related to configuration families, subconfigs, + and kernel sets. Two of those lists are built via substitution + of umbrella families with their subconfig members, and one of + those lists was improperly performing the substitution in a way + that would erroneously match on partial umbrella family names. + That code was changed to match the code that was already doing + the substitution properly, via substitute_words(). + - Added comments noting the importance of using substitute_words() + in both instances. + +commit e9da6425e27a9d63c9fef92afc2dd750c601ccd7 +Author: Field G. Van Zee +Date: Wed Oct 13 14:15:38 2021 -0500 + + Allow use of 1m with mixing of row/col-pref ukrs. + + Details: + - Fixed a bug that broke the use of 1m for dcomplex when the single- + precision real and double-precision real ukernels had opposing I/O + preferences (row-preferential sgemm ukernel + column-preferential + dgemm ukernel, or vice versa). The fix involved adjusting the API + to bli_cntx_set_ind_blkszs() so that the induced method context init + function (e.g., bli_cntx_init__ind()) could call that + function for only one datatype at a time. This allowed the blocksize + scaling (which varies depending on whether we're doing 1m_r or 1m_c) + to happen on a per-datatype basis. This fixes issue #557.
Thanks to + Devin Matthews and RuQing Xu for helping discover and report this bug. + - The aforementioned 1m fix required moving the 1m_r/1m_c logic from + bli_cntx_ref.c into a new function, bli_l3_set_schemas(), which is + called from each level-3 _front() function. The pack_t schemas in the + cntx_t were also removed entirely, along with the associated accessor + functions. This in turn required updating the trsm1m-related virtual + ukernels to read the pack schema for B from the auxinfo_t struct + rather than the context. This also required slight tweaks to + bli_gemm_md.c. + - Repositioned the logic for transposing the operation to accommodate + the microkernel IO preference. This mostly only affects gemm. Thanks + to Devin Matthews for his help with this. + - Updated dpackm pack ukernels in the 'armsve' kernel set to avoid + querying pack_t schemas from the context. + - Removed the num_t dt argument from the ind_cntx_init_ft type defined + in bli_gks.c. The context initialization functions for induced methods + were previously passed a dt argument, but I can no longer figure out + *why* they were passed this value. To reduce confusion, I've removed + the dt argument (including also from the function definition + + prototype). + - Commented out setting of cntx_t schemas in bli_cntx_ind_stage.c. This + breaks high-level implementations of 3m and 4m, but this is okay since + those implementations will be removed very soon. + - Removed some older blocks of preprocessor-disabled code. + - Comment update to test_libblis.c. + +commit 81e103463214d589071ccbe2d90b8d7c19a186e4 +Author: Minh Quan Ho <1337056+hominhquan@users.noreply.github.com> +Date: Wed Oct 13 20:28:02 2021 +0200 + + Alloc at least 1 elem in pool_t block_ptrs. (#560) + + Details: + - Previously, the block_ptrs field of the pool_t was allowed to be + initialized as any unsigned integer, including 0.
However, a length of + 0 could be problematic given that malloc(0) is undefined and therefore + variable across implementations. As a safety measure, we check for + block_ptrs array lengths of 0 and, in that case, increase them to 1. + - Co-authored-by: Minh Quan Ho + +commit 327481a4b0acf485d0cbdd8635dd9b886ba3f2a7 +Author: Minh Quan Ho <1337056+hominhquan@users.noreply.github.com> +Date: Tue Oct 12 19:53:04 2021 +0200 + + Fix insufficient pool-growing logic in bli_pool.c. (#559) + + Details: + - The current mechanism for growing a pool_t doubles the length of the + block_ptrs array every time the array length needs to be increased + due to new blocks being added. However, that logic did not take into + account the new total number of blocks, and the fact that the caller + may be requesting more blocks than would fit even after doubling the + current length of block_ptrs. The code comments now contain two + illustrating examples that show why, even after doubling, we must + always have at least enough room to fit all of the old blocks plus + the newly requested blocks. + - This commit also happens to fix a memory corruption issue that stems + from growing any pool_t that is initialized with a block_ptrs length + of 0. (Previously, the memory pool for packed buffers of C was + initialized with a block_ptrs length of 0, but because it is unused + this bug did not manifest by default.)
+ - Co-authored-by: Minh Quan Ho + +commit 32a6d93ef6e2af5e486dfd5e46f8272153d3d53d +Merge: 408906fd 2604f407 +Author: Devin Matthews +Date: Sat Oct 9 15:53:54 2021 -0500 + + Merge pull request #543 from xrq-phys/armsve-packm-fix + + ARMSVE Block SVE-Intrinsic Kernels for GCC 8-9 + +commit 408906fdd8892032aa11bd061b7971128f453bef +Merge: 4277fec0 ccf16289 +Author: Devin Matthews +Date: Sat Oct 9 15:50:25 2021 -0500 + + Merge pull request #542 from xrq-phys/armsve-zgemm + + Arm SVE CGEMM / ZGEMM Natural Kernels + +commit ccf16289d2e71fd9511ccf2d13dcebbfa29deabc +Author: RuQing Xu +Date: Fri Oct 8 12:34:14 2021 +0900 + + Arm SVE C/ZGEMM Fix FMOV 0 Mistake + + FMOV [hsd]M, #imm does not allow zero immediate. + Use wzr, xzr instead. + +commit 82b61283b2005f900101056e6df2a108258db602 +Author: RuQing Xu +Date: Fri Oct 8 12:17:29 2021 +0900 + + SH Kernel Unused Either + +commit 1749dfa493054abd2e4ddba7cb21278d337e4f74 +Author: RuQing Xu +Date: Fri Oct 8 12:11:53 2021 +0900 + + Arm SVE C/ZGEMM Support *beta==0 + +commit 4b648e47daad256ab8ab698173a97f71ab9f75eb +Author: RuQing Xu +Date: Wed Sep 22 16:42:09 2021 +0900 + + Arm SVE Config armsve Use ZGEMM/CGEMM + +commit f76ea905e216cf640975e6319c6d2f54aeafed2e +Author: RuQing Xu +Date: Tue Sep 21 20:38:44 2021 +0900 + + Arm SVE: Update Perf. Graph + + Pic. size seems a bit different from upstream. + Generated w/ MATLAB. Open to any change.
+ +commit 66a018e6ad00d9e8967b67e1aa3e23b20a7efdfe +Author: RuQing Xu +Date: Mon Sep 20 00:16:11 2021 +0900 + + Arm SVE CGEMM 2Vx10 Unindex Process Alpha=1.0 + +commit 9e1e781cb59f8fadb2a10a02376d3feac17ce38d +Author: RuQing Xu +Date: Sun Sep 19 23:30:42 2021 +0900 + + Arm SVE ZGEMM 2Vx10 Unindex Process Alpha=1.0 + +commit f7c6c2b119423e7ba7a24ae2156790e076071cba +Author: RuQing Xu +Date: Thu Sep 16 01:47:42 2021 +0900 + + A64FX Config Use ZGEMM/CGEMM + +commit e4cabb977d038688688aca39b366f98f9c36b7eb +Author: RuQing Xu +Date: Thu Sep 16 01:34:26 2021 +0900 + + Arm SVE Typo Fix ZGEMM/CGEMM C Prefetch Reg + +commit b677e0d61b23f26d9536e5c363fd6bbab6ee1540 +Author: RuQing Xu +Date: Thu Sep 16 01:18:54 2021 +0900 + + Arm SVE Add SGEMM 2Vx10 Unindexed + +commit 3f68e8309f2c5b31e25c0964395a180a80014d36 +Author: RuQing Xu +Date: Thu Sep 16 01:00:54 2021 +0900 + + Arm SVE ZGEMM Support Gather Load / Scatt. St. + +commit c19db2ff826e2ea6ac54569e8aa37e91bdf7cabe +Author: RuQing Xu +Date: Wed Sep 15 23:39:53 2021 +0900 + + Arm SVE Add ZGEMM 2Vx10 Unindexed + +commit e13abde30b9e0e381c730c496e74bc7ae062a674 +Author: RuQing Xu +Date: Wed Sep 15 04:19:45 2021 +0900 + + Arm SVE Add ZGEMM 2Vx7 Unindexed + +commit 49b9d7998eb86f340ae7b26af3e5a135d6a8feee +Author: RuQing Xu +Date: Tue Sep 14 04:02:47 2021 +0900 + + Arm SVE Add ZGEMM 2Vx8 Unindexed + +commit 4277fec0d0293400497ae8bcfc32be5e62319ae9 +Merge: 2329d990 f44149f7 +Author: Devin Matthews +Date: Thu Oct 7 13:47:22 2021 -0500 + + Merge pull request #533 from xrq-phys/arm64-hi-bw + + ARMv8 PACKM and GEMMSUP Kernels + Apple Firestorm Subconfig + +commit 2329d99016fe1aeb86da4552295f497543cea311 (origin/1m_row_col_problem) +Author: Devin Matthews +Date: Thu Oct 7 12:37:58 2021 -0500 + + Update Travis CI badge + + [ci skip] + +commit f44149f787ae3d4b53d9c4d8e6f23b2818b7770d +Author: RuQing Xu +Date: Fri Oct 8 02:35:58 2021 +0900 + + Armv8 Trash New Bulk Kernels + + - They didn't make much improvements. 
+ - Can't register row-preferral and column-preferral ukrs at the same time. + Will break 1m. + +commit 70b52cadc5ef4c16431e1876b407019e6286614e +Author: Devin Matthews +Date: Thu Oct 7 12:34:35 2021 -0500 + + Enable testing 1m in `make check`. + +commit 2604f4071300d109f28c8438be845aeaf3ec44e4 +Author: RuQing Xu +Date: Thu Oct 7 02:39:00 2021 +0900 + + Config ArmSVE Unregister 12xk. Move 12xk to Old + +commit 1e3200326be9109eb0f8c7b9e4f952e45700cbba +Author: RuQing Xu +Date: Thu Oct 7 02:37:14 2021 +0900 + + Revert __has_include(). Distinguish w/ BLIS_FAMILY_** + +commit a4066f278a5c06f73b16ded25f115ca4b7728ecb +Author: RuQing Xu +Date: Thu Oct 7 02:26:05 2021 +0900 + + Register firestorm into arm64 Metaconfig + +commit d7a3372247c37568d142110a1537632b34b8f2ff +Author: RuQing Xu +Date: Thu Oct 7 02:25:14 2021 +0900 + + Armv8 DGEMMSUP Fix Edge 6x4 Switch Case Typo + +commit 2920dde5ac52e09f84aa42990aab8340421522ce +Author: RuQing Xu +Date: Thu Oct 7 02:01:45 2021 +0900 + + Armv8 DGEMMSUP Fix 8x4m Store Inst. Typo + +commit 14b13583f1802c002e195b3b48874b3ebadbeb20 +Author: Devin Matthews +Date: Wed Oct 6 10:22:34 2021 -0500 + + Add test for Apple M1 (firestorm) + + This test will run on Linux, but all the kernels should run just fine. This does not test autodetection but then none of the other ARM tests do either. + +commit a024715065532400da6257b8b3124ca5aecda405 +Author: RuQing Xu +Date: Thu Oct 7 00:15:54 2021 +0900 + + Firestorm CPUID Dispatcher + + Commenting out due to possibly a Xcode bug. + +commit b9da6d55fec447d05c8b67f34ce83617123d8357 +Author: RuQing Xu +Date: Wed Oct 6 12:25:54 2021 +0900 + + Armv8 GEMMSUP Edge Cases Require Signed Ints + + Fix a bug in bli_gemmsup_rd_armv8a_asm_d6x8m.c. + For safety upon similar strategies in the future, + change all [mn]_[iter/left] into signed ints. + +commit 34919de3df5dda7a06fc09dcec12ca46dc8b26f4 +Author: Devin Matthews +Date: Sat Oct 2 18:48:50 2021 -0500 + + Make error checking level a thread-local variable. 
+ + Previously, this was a global variable. Setting the value was synchronized via a mutex but reading the value was not. Of course, these accesses are almost certainly atomic, but there is still the possibility of one thread attempting to set the value and then reading the value set by another thread. For correct operation under user threading (e.g. pthreads), this should probably be thread-local with no mutex. + +commit c3024993c3d50236fad112822215f066496c5831 +Author: Devin Matthews +Date: Tue Oct 5 15:20:27 2021 -0500 + + Fix data race in testsuite. + +commit 353a0d82572f26e78102cee25693130ce6e0ea5b +Author: Devin Matthews +Date: Tue Oct 5 14:24:17 2021 -0500 + + Update .appveyor.yml + + [ci skip] + +commit 4bfadf9b561d4ebe0bbaf8b6d332f07ff531d618 +Author: RuQing Xu +Date: Wed Oct 6 01:51:26 2021 +0900 + + Firestorm Block Size Fixes + +commit 40baf83f0ea2749199b93b5a8ac45c01794b008c +Author: RuQing Xu +Date: Wed Oct 6 01:00:52 2021 +0900 + + Armv8 Handle *beta == 0 for GEMMSUP ??r Case. + +commit 079fbd42ce8cf7ea67a939b0f80f488de5821319 +Merge: f5c03e9f 9905f443 +Author: Devin Matthews +Date: Mon Oct 4 17:21:48 2021 -0500 + + Merge branch 'master' into arm64-hi-bw + +commit 9905f44347eea4c57ef4927b81f1c63e76a92739 +Merge: 6d3036e3 64a421f6 +Author: Devin Matthews +Date: Mon Oct 4 15:58:59 2021 -0500 + + Merge pull request #553 from flame/rpath-fix + + Add an option to use an @rpath-dependent install_name on macOS + +commit 6d3036e31d8a2c1acbc1260489eeb8f535a8f97a +Merge: 53377fcc eaa554aa +Author: Devin Matthews +Date: Mon Oct 4 15:58:43 2021 -0500 + + Merge pull request #545 from hominhquan/clean_error + + bli_error: more cleanup on the error strings array + +commit 53377fcca91e595787b38e2a47780ac0c35a7e7c +Merge: d0a0b4b8 80c5366e +Author: Devin Matthews +Date: Mon Oct 4 15:45:53 2021 -0500 + + Merge pull request #554 from flame/armsve-cleanup + + Move unused ARM SVE kernels to "old" directory. 
+ +commit 80c5366e4a9b8b72d97fba1eab89bab8989c44f4 +Author: Devin Matthews +Date: Mon Oct 4 15:40:28 2021 -0500 + + Move unused ARM SVE kernels to "old" directory. + +commit 64a421f6983ab5bc0b55df30a2ddcfff5bfd73be +Author: Devin Matthews +Date: Mon Oct 4 13:40:43 2021 -0500 + + Add an option to control whether or not to use @rpath. + + Adds `--enable-rpath/--disable-rpath` (default disabled) to use an install_name starting with @rpath/. Otherwise, set the install_name to the absolute path of the install library, which was the previous behavior. + +commit c4a31683dd6f4da3065d86c11dd998da5192740a +Author: Devin Matthews +Date: Mon Oct 4 13:27:10 2021 -0500 + + Fix $ORIGIN usage on linux. + +commit d0a0b4b841fce56b7b2d3c03c5d93ad173ce2b97 +Author: Dave Love +Date: Mon Oct 4 18:03:04 2021 +0000 + + Arm micro-architecture dispatch (#344) + + Details: + - Reworked support for ARM hardware detection in bli_cpuid.c to parse + the result of a CPUID-like instruction. + - Added a64fx support to bli_gks.c. + - #include arm64 and arm32 family headers from bli_arch_config.h. + - Fix the ordering of the "armsve" and "a64fx" strings in the + config_name string array in bli_arch.c. The ordering did not match + the ordering of the corresponding arch_t values in bli_type_defs.h, + as it should have all along. + - Added clang support to make_defs.mk in arm64, cortexa53, cortexa57 + subconfigs. + - Updated arm64 and arm32 families in config_registry. + - Updated docs/HardwareSupport.md to reflect added ARM support. + - Thanks to Dave Love, RuQing Xu, and Devin Matthews for their + contributions in this PR (#344). + +commit 91408d161a2b80871463ffb6f34c455bdfb72492 +Author: Devin Matthews +Date: Mon Oct 4 11:37:48 2021 -0500 + + Use @rpath-based install name on macOS and use relocatable RPATH entries for testsuite binaries. + + - RPATH entries (and DYLD_LIBRARY_PATH) do nothing on macOS unless the install_name of the library starts with @rpath/.
While the install_name can be set to the absolute install path, this makes the installation non-relocatable. When using @rpath in the install_name, install paths within the normal DYLD_LIBRARY_PATH work with no changes on the user side, but for install paths off the beaten track, users must specify an RPATH entry when linking (or modify DYLD_LIBRARY_PATH at runtime). Perhaps this could be made into a configure-time option. + - Having relocatable testsuite binaries is not necessarily a priority but it is easy to do with @executable_path (macOS) or $ORIGIN (linux/BSD). + +commit f5c03e9fe808f9bd8a3e0c62786334e13c46b0fc +Author: RuQing Xu +Date: Sun Oct 3 16:51:51 2021 +0900 + + Armv8 Handle *beta == 0 for GEMMSUP ?rc Case. + +commit abc648352c591e26ceee436bd3a45400115b70c5 +Author: RuQing Xu +Date: Sun Oct 3 13:14:19 2021 +0900 + + Armv8 Fix 6x8 Row-Maj Ukr + + - Fixed for 6x8 only, 4x4 & 4x8 pending; + - Installed to config firestorm as benchmark seems to show better perf: + Old: + blis_dgemm_ukr_c 6 8 320 36.87 2.43e-17 PASS + blis_dgemm_ukr_c 6 8 352 40.55 1.04e-17 PASS + blis_dgemm_ukr_c 6 8 384 44.24 5.68e-17 PASS + blis_dgemm_ukr_c 6 8 416 41.67 3.51e-17 PASS + blis_dgemm_ukr_c 6 8 448 34.41 2.94e-17 PASS + blis_dgemm_ukr_c 6 8 480 42.53 2.35e-17 PASS + + New: + blis_dgemm_ukr_r 6 8 352 50.69 1.59e-17 PASS + blis_dgemm_ukr_r 6 8 384 49.15 5.55e-17 PASS + blis_dgemm_ukr_r 6 8 416 50.44 2.86e-17 PASS + blis_dgemm_ukr_r 6 8 448 46.92 3.12e-17 PASS + blis_dgemm_ukr_r 6 8 480 48.08 4.08e-17 PASS + +commit 0a45bc0fbc7aee3876c315ed567fc37f19cdc57f +Merge: 5013a6cb 13dbd5b5 +Author: Devin Matthews +Date: Sat Oct 2 18:59:43 2021 -0500 + + Merge pull request #552 from flame/armsve_beta_0 + + Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs. + +commit 13dbd5b5d3dbf27e33ecf0e98d43c97019a6339d +Author: Devin Matthews +Date: Sat Oct 2 20:40:25 2021 +0000 + + Apply patch from @xrq-phys.
+ +commit ae0eeeaf77c77892db17027cef10b95ec97c904f +Author: Devin Matthews +Date: Wed Sep 29 16:42:33 2021 -0500 + + Add explicit handling for beta == 0 in armsve sd and armv7a d gemm ukrs. + +commit 5013a6cb7110746c417da96e4a1308ef681b0b88 +Author: Field G. Van Zee +Date: Wed Sep 29 10:38:50 2021 -0500 + + More edits and fixes to docs/FAQ.md. + +commit b36fb0fbc5fda13d9a52cc64953341d3d53067ee +Author: Field G. Van Zee +Date: Tue Sep 28 18:47:45 2021 -0500 + + Fixed newly broken link to CREDITS in FAQ.md. + +commit 3442d4002b3bfffd8848f72103b30691df2b19b1 +Author: Field G. Van Zee +Date: Tue Sep 28 18:43:23 2021 -0500 + + More minor fixes to FAQ.md and Sandboxes.md. + +commit 89aaf00650d6cc19b83af2aea6c8d04ddd3769cb +Author: Field G. Van Zee +Date: Tue Sep 28 18:34:33 2021 -0500 + + Updates to FAQ.md, Sandboxes.md, and README.md. + + Details: + - Updated FAQ.md to include two new questions, reordered an existing + question, and also removed an outdated and redundant question about + BLIS vs. AMD BLIS. + - Updated Sandboxes.md to use 'gemmlike' as its main example, along with + other smaller details. + - Added ARM as a funder to README.md. + +commit c52c43115ec2264fda9380c48d9e6bb1e1ea2ead +Merge: 1fc23d21 1f527a93 +Author: Field G. Van Zee +Date: Sun Sep 26 15:56:54 2021 -0500 + + Merge branch 'dev' + +commit 1fc23d2141189c7b583a5bff2cffd87fd5261444 +Author: Field G. Van Zee +Date: Tue Sep 21 14:54:20 2021 -0500 + + Safelist 'master', 'dev', 'amd' branches. + + Details: + - Modified .travis.yml so that only commits to 'master', 'dev', and + 'amd' branches get built by Travis CI. Thanks to Devin Matthews for + helping to track down the syntax for this change. + +commit 1f527a93b996093e06ef7a8e94fb47ee7e690ce0 +Author: Field G. Van Zee +Date: Mon Sep 20 17:56:36 2021 -0500 + + Re-enable and fix fb93d24. + + Details: + - Re-enabled the changes made in fb93d24. 
+ - Defined BLIS_ENABLE_SYSTEM in bli_arch.c, bli_cpuid.c, and bli_env.c, + all of which needed the definition (in addition to config_detect.c) in + order for the configure-time hardware detection binary to be compiled + properly. Thanks to Minh Quan Ho for helping identify these additional + files as needing to be updated. + - Added additional comments to all four source files, most notably to + prompt the reader to remember to update all of the files when updating + any of the files. Also made the cpp code in each of the files as + consistent/similar as possible. + - Refer to issues #532 and PR #546 for more history. + +commit 7b39c1492067de941f81b49a3b6c1583290336fd +Author: Field G. Van Zee +Date: Mon Sep 20 16:13:50 2021 -0500 + + Reverted fb93d24. + + Details: + - The latest changes in fb93d24 are still causing problems. Reverting + and preparing to move them to a branch. + +commit fb93d242a4fef4694ce2680436da23087bbdd5fe +Author: Field G. Van Zee +Date: Mon Sep 20 15:42:08 2021 -0500 + + Re-enable and fix 8e0c425 (BLIS_ENABLE_SYSTEM). + + Details: + - Re-enable the changes originally made in 8e0c425 but quickly reverted + in 2be78fc. + - Moved the #include of bli_config.h so that it occurs before the + #include of bli_system.h. This allows the #define BLIS_ENABLE_SYSTEM + or #define BLIS_DISABLE_SYSTEM in bli_config.h to be processed by the + time it is needed in bli_system.h. This change should have been + in the original 8e0c425, but was accidentally omitted. Thanks to Minh + Quan Ho for catching this. + - Add #define BLIS_ENABLE_SYSTEM to config_detect.c so that the proper + cpp conditional branch executes in bli_system.h when compiling the + hardware detection binary. The changes made in 8e0c425 were an attempt + to support the definition of BLIS_OS_NONE when configuring with + --disable-system (in issue #532). 
That commit failed because, aside + from the required but omitted header reordering (second bullet above), + AppVeyor was unable to compile the hardware detection binary as a + result of missing Windows headers. This commit, which builds on PR + #546, should help fix that issue. Thanks to Minh Quan Ho for his + assistance and patience on this matter. + +commit eaa554aa52b879d181fdc87ba0bfad3ab6131517 +Author: Minh Quan HO +Date: Wed Sep 15 15:39:36 2021 +0200 + + bli_error: more cleanup on the error strings array + + - There was redundancy between the macro BLIS_MAX_NUM_ERR_MSGS (=200) and + the enum BLIS_ERROR_CODE_MAX (-170), while they both mean the same thing: + the maximal number of error codes/messages. + - The previous initialization of error messages at compile time ignored that + the 'bli_error_string' array still occupies useless memory due to 2D char[][] + declaration. Instead, it should be just an array of pointers, pointing at + strings in .rodata section. + - This commit does the two modifications: + * retired macros BLIS_MAX_NUM_ERR_MSGS and BLIS_MAX_ERR_MSG_LENGTH everywhere + * switch bli_error_string from char[][] to char *[] to reduce its footprint + from 40KB (200*200) to 1.3KB (170*sizeof(char*)). + (No problem to use the enum BLIS_ERROR_CODE_MAX at compile-time, + since compiler is smart enough to determine its value is 170.) + +commit 52f29f739dbbb878c4cde36dbe26b82847acd4e9 +Author: Field G. Van Zee +Date: Fri Sep 17 08:38:29 2021 -0500 + + Removed last vestige of #define BLIS_NUM_ARCHS. + + Details: + - Removed the commented-out #define BLIS_NUM_ARCHS in bli_type_defs.h + and its associated (now outdated) comments. BLIS_NUM_ARCHS has been + part of the arch_t enum for some time now, and so this change is + mostly about removing any opportunity for confusion for people who + may be reading the code. Thanks to Minh Quan Ho for leading me to + this cleanup.
Van Zee +Date: Thu Sep 16 14:47:45 2021 -0500 + + Added new packm var3 to 'gemmlike'. + + Details: + - Defined a new packm variant for the 'gemmlike' sandbox. This new + variant (bls_l3_packm_var3.c) parallelizes the packing operation over + the k dimension rather than the m or n dimensions. Note that the + gemmlike implementation still uses var1 by default, and use of the new + code would require changing bls_l3_packm_a.c and/or bls_l3_packm_b.c + so that var3 is called instead. Thanks to Jeff Diamond for proposing + this (perhaps NUMA-friendly) solution. + +commit b6f71fd378b7cd0cdc5c780e0b8c975a7abde998 +Merge: 9293a68e e3dc1954 +Author: Devin Matthews +Date: Thu Sep 16 12:24:33 2021 -0500 + + Merge pull request #544 from flame/haswell-gemmsup-fpe + + Fix more copy-paste errors in the haswell gemmsup code. + +commit e3dc1954ffb5eee2a8b41fce85ba589f75770eea +Author: Devin Matthews +Date: Thu Sep 16 10:59:37 2021 -0500 + + Fix problem where uninitialized registers are included in vhaddpd in the Mx1 gemmsup kernels for haswell. + + The fix is to use the same (valid) source register twice in the horizontal addition. + +commit 5191c43faccf45975f577c60b9089abee25722c9 +Author: Devin Matthews +Date: Thu Sep 16 10:16:17 2021 -0500 + + Fix more copy-paste errors in the haswell gemmsup code. + + Fixes #486. + +commit 30c29b256ef13f0141ca9e9169cbdc7a45ce3a61 +Author: RuQing Xu +Date: Thu Sep 16 05:01:03 2021 +0900 + + Arm SVE Exclude SVE-Intrinsic Kernels for GCC 8-9 + + Affected configs: a64fx. + +commit bffa85be59dece8e756b9444e762f18892c06ee1 +Author: RuQing Xu +Date: Thu Sep 16 04:31:45 2021 +0900 + + Arm SVE: Correct PACKM Ker Name: Intrinsic Kers + + SVE-Intrinsic-based kernels ought not to use asm in their names. 
+ +commit 9293a68eb6557a9ea43a846435908c3d52d4218b +Merge: ade10f42 98ce6e8b +Author: Devin Matthews +Date: Fri Sep 10 14:13:29 2021 -0500 + + Merge pull request #534 from flame/cxx_test + + Add test to Travis using C++ compiler to make sure blis.h is C++-compatible + +commit 98ce6e8bc916e952510872caa60d818d62a31e69 +Author: Devin Matthews +Date: Fri Sep 10 14:12:13 2021 -0500 + + Do a fast test on OSX. [ci skip] + +commit c76fcad0c2836e7140b6bef3942e0a632a5f2cda +Author: Devin Matthews +Date: Fri Sep 10 13:57:02 2021 -0500 + + Fix AArch64 tests and consolidate some other tests. + +commit e486d666ffefee790d5e39895222b575886ac1ea +Author: Devin Matthews +Date: Fri Sep 10 13:50:16 2021 -0500 + + Use C++ cross-compiler for ARM tests. + +commit fbb3560cb8e2aeab205c47c2b096d4fa306d93db +Author: Devin Matthews +Date: Fri Sep 10 13:38:27 2021 -0500 + + Attempt to fix cxx-test for OOT builds. + +commit 9c0064f3f67d59263c62d57ae19605562bb87cc2 +Author: Devin Matthews +Date: Fri Sep 10 10:39:04 2021 -0500 + + Fix config_name in bli_arch.c + +commit ade10f427835d5274411cafc9618ac12966eb1e7 +Author: Field G. Van Zee +Date: Fri Aug 27 12:47:12 2021 -0500 + + Updated travis-ci.org link in README.md to .com. + +commit 2be78fc97777148c83d20b8509e38aa1fc1b4540 +Author: Field G. Van Zee +Date: Fri Aug 27 12:17:26 2021 -0500 + + Disabled (at least temporarily) commit 8e0c425. + + Details: + - Reverted changes in 8e0c425 due to AppVeyor build failures that we do + not yet understand. + +commit 820f11a4694aee5f234e24277aecca40885ae9d4 +Author: RuQing Xu +Date: Fri Aug 27 13:40:26 2021 +0900 + + Arm Whole GEMMSUP Call Route is Asm/Int Optimized + + - `ref2` call in `bli_gemmsup_rv_armv8a_asm_d6x8m.c` is commented out. + - `bli_gemmsup_rv_armv8a_asm_d4x8m.c` contains a tail `ref2` call but + it's not called by any upper routine. + +commit 8e0c4255de52a0a5cffecbebf6314aa52120ebe4 +Author: Field G. 
Van Zee +Date: Thu Aug 26 15:29:18 2021 -0500 + + Define BLIS_OS_NONE when using --disable-system. + + Details: + - Modified bli_system.h so that the cpp macro BLIS_OS_NONE is defined + when BLIS_DISABLE_SYSTEM is defined. Otherwise, the previous OS- + detecting macro conditionals are considered. This change is to + accommodate a solution to a cross-compilation issue described in + #532. + +commit d6eb70fbc382ad7732dedb4afa01cf9f53e3e027 +Author: Field G. Van Zee +Date: Thu Aug 26 13:12:39 2021 -0500 + + Updated stale calls to malloc_intl() in gemmlike. + + Details: + - Updated two out-of-date calls to bli_malloc_intl() within the gemmlike + sandbox. These calls to malloc_intl(), which resided in + bls_l3_decor_pthreads.c, were missing the err_t argument that the + function uses to report errors. Thanks to Jeff Diamond for helping + isolate this issue. + +commit 2f7325b2b770a15ff8aaaecc087b22238f0c67b7 +Author: Field G. Van Zee +Date: Mon Aug 23 15:04:05 2021 -0500 + + Blacklist clang10/gcc9 and older for 'armsve'. + + Details: + - Prohibit use of clang 10.x and older or gcc 9.x and older for the + 'armsve' subconfiguration. Addresses issue #535. + +commit 7e2951e61fda1c325d6a76ca9956253482d84924 +Author: RuQing Xu +Date: Mon Aug 23 17:06:44 2021 +0900 + + Arm: DGEMMSUP `Macro' Edge Cases Stop Calling Ref + + Ref cannot handle panel strides (packed cases) thus cannot be called + from the beginning of `gemmsup` (i.e. cannot be dispatch target of + gemmsup to other sizes.) + +commit 4fd82b0e9348553d83e258bd4969e49a81f8fcf0 +Author: RuQing Xu +Date: Mon Aug 23 05:18:32 2021 +0900 + + Header Typo + +commit 35409ebe67557c0e7cf5ced138c8166c9c1c909f +Author: RuQing Xu +Date: Mon Aug 23 04:51:47 2021 +0900 + + Arm: DGEMMSUP ??r(rv) Invoke Edge Size + + Plus some fix at edges. + + TODO: Should ensure that no ref kernel appear in beginning of gemmsup + kernels. As ref does not recognise panel stride. 
+ +commit a361492c24fdd919ee037763fc6523e8d7d2967a +Author: RuQing Xu +Date: Mon Aug 23 01:13:39 2021 +0900 + + Arm: DGEMMSUP ?rc(rd) Invoke Edge Size + +commit eaea67401c2ab31f2e51eede59725f64c1a21785 +Merge: 5fc65cdd e320ec6d +Author: Devin Matthews +Date: Sat Aug 21 16:09:31 2021 -0500 + + Merge branch 'master' into cxx_test + +commit 5fc65cdd9e4134c5dcb16d21cd4a79ff426ca9f3 +Author: Devin Matthews +Date: Sat Aug 21 15:59:27 2021 -0500 + + Add test to Travis using C++ compiler to make sure blis.h is C++-compatible. + +commit e320ec6d5cd44e03cb2e2faa1d7625e84f76d668 +Author: Field G. Van Zee +Date: Fri Aug 20 17:15:20 2021 -0500 + + Moved lang defs from _macro_def.h to _lang_defs.h. + + Details: + - Moved miscellaneous language-related definitions, including defs + related to the handling of the 'restrict' keyword, from the top half + of bli_macro_defs.h into a new file, bli_lang_defs.h, which is now + #included immediately after "bli_system.h" in blis.h. This change is + an attempt to fix a report of recent breakage of C++ compilers due + to the recent introduction of 'restrict' in bli_type_defs.h (which + previously was being included *before* bli_macro_defs.h and its + restrict handling therein. Thanks to Ivan Korostelev for reporting + this issue in #527. + - CREDITS file update. + +commit e6799b26a6ecf1e80661a77d857d1c9e9adf50dc +Author: RuQing Xu +Date: Sat Aug 21 02:39:38 2021 +0900 + + Arm: Implement GEMMSUP Fallback Method + + bli_dgemmsup_rv_armv8a_int_6x4mn + +commit 7d5903d8d7570090eb37c592094424d1c64805d1 +Author: RuQing Xu +Date: Sat Aug 21 01:55:50 2021 +0900 + + Arm64 Fix: Support Alpha/Beta in GEMMSUP Intrin + + Forgot to support `alpha`/`beta` in gemmsup_armv8a_int. + +commit 3b275f810b2479eb5d6cf2296e97a658cf1bb769 +Author: Field G. Van Zee +Date: Thu Aug 19 16:06:46 2021 -0500 + + Minor tweaks to gemmlike sandbox. 
+ + Details: + - In the gemmlike sandbox, changed the loop index variable of inner + loop of packm_cxk() from 'd' to 'i' (and likewise for the + corresponding inlined code within packm_var2()). + - Pack matrices A and B using packm_var1() instead of packm_var2(). + +commit 3eccfd456e7e84052c9a429dcde1183a7ecfaa48 +Author: Field G. Van Zee +Date: Thu Aug 19 13:22:10 2021 -0500 + + Added local _check() code to gemmlike sandbox. + + Details: + - Added code to the gemmlike sandbox that handles parameter checking. + Previously, the gemmlike implementation called bli_gemm_check(), which + resides within the BLIS framework proper. Certain modifications that a + user may wish to perform on the sandbox, such as adding a new matrix + or vector operand, would have required additional checks, and so these + changes make it easier for such a person to implement those checks for + their custom gemm-like operation. + +commit 7144230cdb0653b70035ddd91f7f41e06ad8d011 +Author: Field G. Van Zee +Date: Wed Aug 18 13:25:39 2021 -0500 + + README.md citation updates (e.g. BLIS7 bibtex). + +commit 4a955e939044cfd2048cf9f3e33024e3ad1fbe00 +Author: Field G. Van Zee +Date: Mon Aug 16 13:49:27 2021 -0500 + + Tweaks to gemmlike to facilitate 3rd party mods. + + Details: + - Changed the implementation in the 'gemmlike' sandbox to more easily + allow others to provide custom implementations of packm. These changes + include: + - Calling a local version of packm_cxk() that can be modified. This + version of packm_cxk() uses inlined loops in packm_cxk() rather + than querying the context for packm kernels (or even using scal2m). + - Providing two variants of packm, one of which calls the + aforementioned packm_cxk(), the other of which inlines the contents + of packm_cxk() into the variant itself, making it self-contained. + To switch from one to the other, simply change which function gets + called within bls_packm_a() and bls_packm_b(). 
+ - Simplified and cleaned up some variant names in both variants of + packm, relative to their parent code. + +commit 2c0b4150e40c83ea814f69ca766da74c19ed0a58 +Merge: c99fae50 4b8ed99d +Author: Devin Matthews +Date: Sat Aug 14 18:41:35 2021 -0500 + + Merge pull request #527 from flame/obj_t_makeover + + Implement proposed new function pointer fields for obj_t. + +commit 4b8ed99d926876fbf54c15468feae4637268eb6b +Author: Field G. Van Zee +Date: Fri Aug 13 15:31:10 2021 -0500 + + Whitespace tweaks. + +commit c99fae50ac3de0b5380a085aeebebfe67a645407 +Merge: e6d68bc4 4f70eb79 +Author: Devin Matthews +Date: Fri Aug 13 14:48:00 2021 -0500 + + Merge pull request #530 from flame/fix_clang_warnings + + Clean up some warnings that show up on clang/OSX. + +commit e6d68bc4fd0981bea90d7f045779cacfe53f6ae8 +Merge: 20a1c401 ec06b6a5 +Author: Devin Matthews +Date: Fri Aug 13 14:47:46 2021 -0500 + + Merge pull request #529 from flame/fix_make_check_dependencies + + Add dependency on the "flat" blis.h file for the BLIS and BLAS testuite objects. + +commit 1772db029e10e0075b5a59d3fb098487b1ad542a +Author: Devin Matthews +Date: Fri Aug 13 14:46:35 2021 -0500 + + Add row- and column-strides for A/B in obj_ukr_fn_t. + +commit 4f70eb7913ad3ded193870361b6da62b20ec3823 +Author: Devin Matthews +Date: Fri Aug 13 11:12:43 2021 -0500 + + Clean up some warnings that show up on clang/OSX. + +commit 3cddce1e2a021be6064b90af30022b99cbfea986 +Author: Devin Matthews +Date: Thu Aug 12 22:32:34 2021 -0500 + + Remove schema field on obj_t (redundant) and add new API functions. + +commit ec06b6a503a203fa0cdb23273af3c0e3afeae7fa +Author: Devin Matthews +Date: Thu Aug 12 19:27:31 2021 -0500 + + Add dependency on the "flat" blis.h file for the BLIS and BLAS testsuite objects. + + This fixes a bug where "make -j check" may fail after a change to one or more header files, or where testsuite code doesn't get properly recompiled after internal changes. 
+ +commit 20a1c4014c999063e6bc1cfa605b152454c5cbf4 +Author: Field G. Van Zee +Date: Thu Aug 12 14:44:04 2021 -0500 + + Disabled sanity check in bli_pool_finalize(). + + Details: + - Disabled a sanity check in bli_pool_finalize() that was meant to alert + the user if a pool_t was being finalized while some blocks were still + checked out. However, this is exactly the situation that might happen + when a pool_t is re-initialized for a larger blocksize, and currently + bli_pool_reinit() is implemented as _finalize() followed by _init(). + So, this sanity check is not universally appropriate. Thanks to + AMD-India for reporting this issue. + +commit e366665cd2b5ae8d7683f5ba2de345df0a41096f +Author: Field G. Van Zee +Date: Thu Aug 12 14:06:53 2021 -0500 + + Fixed stale API calls to membrk API in gemmlike. + + Details: + - Updated stale calls to the bli_membrk API within the 'gemmlike' + sandbox. This API is now called bli_pba (packed block allocator). + Ideally, this forgotten update would have been included as part of + 21911d6, which is when the branch where the membrk->pba changes were + introduced was merged into 'master'. + - Comment updates. + +commit e38ca28689f31c5e5bd2347704dc33042e5ea176 +Author: RuQing Xu +Date: Fri Aug 13 03:21:19 2021 +0900 + + Added Apple Firestorm (A14/M1) Subconfig + + - Use the same bulk kernel as Cortex-A53 / ThunderX2; + - Larger block size; + - Use gemmsup kernels for double precision.
+ +commit 3df0e9b653fbb1293cad93010273eea579e753d9 +Author: RuQing Xu +Date: Sat Jul 17 04:21:53 2021 +0900 + + Arm64 8x4 Kernel Use Less Regs + +commit 4e7e225057a05b9722ce65ddf75a9c31af9fbf36 +Author: RuQing Xu +Date: Wed Jun 9 15:46:36 2021 +0900 + + Armv8-A Supplimentary GEMMSUP Sizes for RD + +commit c792d506ba09530395c439051727631fd164f59a +Author: RuQing Xu +Date: Sat Jun 5 04:20:24 2021 +0900 + + Armv8-A Fix GEMMSUP-RD Kernels on GNU Asm + + Suffixed NEON opcode is not supported by GNU assembler + +commit ce4473520975c2c8790c82c65a69d75f8ad758ea +Author: RuQing Xu +Date: Sat Jun 5 04:08:14 2021 +0900 + + Armv8-A Adjust Types for PACKM Kernels + + GCC does not have full NEON intrinsics support. + +commit 8a32d19af85b61af92fcab1c316fb3be1a8d42ce +Author: RuQing Xu +Date: Sat Jun 5 03:31:30 2021 +0900 + + Armv8-A GEMMSUP-RD 6x8m + + Armv8-A now has a complete set of GEMMSUP kernels.. + +commit afd0fa6ad1889ed073f781c8aa8635f99e76b601 +Author: RuQing Xu +Date: Sat Jun 5 01:19:01 2021 +0900 + + Armv8-A GEMMSUP-RD 6x8n + +commit 3c5f7405148ab142dee565d00da331d95a7a07b9 +Author: RuQing Xu +Date: Fri Jun 4 21:50:51 2021 +0900 + + Armv8-A s/d Packing Kernels Fix Typo + + For GCC. + +commit 49b05df7929ec3abc0d27b475d2d406116fe2682 +Author: RuQing Xu +Date: Fri Jun 4 18:04:59 2021 +0900 + + Armv8-A Introduced s/d Packing Kernels + + Sizes according to the 2014 kernels. + +commit c3faf93168c3371ff48a2d40d597bdb27021cad4 +Author: RuQing Xu +Date: Thu Jun 3 23:09:05 2021 +0900 + + Armv8-A DGEMMSUP 6x8m Kernel + + Recommended kernels set: + ... + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, + ... 
+ bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1, + -1, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); + ... + +commit 3efe707b5500954941061d4c2363d6ed41d17233 +Author: RuQing Xu +Date: Thu Jun 3 17:20:57 2021 +0900 + + Armv8-A DGEMMSUP Adjustments + +commit 8ed8f5e625de9b77a0f14883283effe79af01771 +Author: RuQing Xu +Date: Thu Jun 3 16:37:37 2021 +0900 + + Armv8-A Add More DGEMMSUP + + - Add 6x8 GEMMSUP. + - Adjust prefetching. + - Workaround for Clang's disability to handle reg clobbering. + - Subproduct 6x8 row-major GEMM <- incomplete. + +commit a9ba79ea14de3b5a271e5970cb473d3c52e2fa5f +Author: RuQing Xu +Date: Wed Jun 2 15:04:29 2021 +0900 + + Armv8-A Add GEMMSUP 4x8n Kernel + + - Compile w/ both GCC & Clang. + - Edge cases use ref-kernels. + - Can give performance boost in some contexts. + +commit df40efe8fbfd399d76c6000ec03791a9b76ffbdf +Author: RuQing Xu +Date: Wed Jun 2 00:04:20 2021 +0900 + + Armv8-A Add Part of GEMMSUP 8x4m Kernel + + - Compile w/ both GCC & Clang + - Only block part is implement. Edge cases WIP + - Not Optimal kernel scheme. Should do 4x8 instead + +commit 66399992881316514f64d68ec9eb60a87d53f674 +Author: RuQing Xu +Date: Sat May 29 05:52:05 2021 +0900 + + Armv8A DGEMM 4x4 Kernel WIP. Slow + + Quite slow. + +commit a29c16394ccef02d29141c79b71fb408e20073e6 +Author: RuQing Xu +Date: Sat May 29 04:58:45 2021 +0900 + + Armv8-A Add 8x4 Kernel WIP + + Test result: a bit lower GFlOps than 6x8. + +commit 64a1f786d58001284aa4f7faf9fae17f0be7a018 +Author: Devin Matthews +Date: Wed Aug 11 17:53:12 2021 -0500 + + Implement proposed new function pointer fields for obj_t. + + The added fields: + 1. `pack_t schema`: storing the pack schema on the object allows the macrokernel to act accordingly without side-channel information from the rntm_t and cntx_t. The pack schema and "pack_[ab]" fields could be removed from those structs. + 2. 
`void* user_data`: this field can be used to store any sort of additional information provided by the user. The pointer is propagated to submatrix objects and copies, but is otherwise ignored by the framework and the default implementations of the following three fields. User-specified pack, kernel, or ukr functions can do whatever they want with the data, and the user is 100% responsible for allocating, assigning, and freeing this buffer. + 3. `obj_pack_fn_t pack`: the function called when a matrix is packed. This functions receives the expected arguments, as well as a mdim_t and mem_t* as memory must be allocated inside this function, and behavior may differ based on which matrix is being backed (i.e. transposition for B). This could also be achieved by passing a desired pack schema, but this would require additional information to travel down the control tree. + 4. `obj_ker_fn_t ker`: the function called when we get to the "second loop", or the macro-kernel. Behavior may depend on the pack schemas of the input matrices. The default implementation would perform the inner two loops around the ukr, and then call either the default ukr or a user-supplied one (next field). + 5. `obj_ukr_fn_t ukr`: the function called by the default macrokernel. This would replace the various current "virtual" microkernels, and could also be used to supply user-defined behavior. Users could supply both a custom kernel (above) and microkernel, although the user-specified kernel does **not** necessarily have to call the ukr function specified on the obj_t. + + Note that no macros or functions for accessing these new fields have been defined yet. That is next once these are finalized. Addresses https://github.com/flame/blis/projects/1#card-62357687. + +commit a32257eeab2e9946e71546a05a1847a39341ec6b +Author: Field G. Van Zee +Date: Thu Aug 5 16:23:02 2021 -0500 + + Fixed bli_init.c compile-time error on OSX clang. 
+ + Details: + - Fixed a compile-time error in bli_init.c when compiling with OSX's + clang. This error was introduced in 868b901, which introduced a + post-declaration struct assignment where the RHS was a struct + initialization expression (i.e. { ... }). This use of struct + initializer expressions apparently works with gcc despite it not + being strict C99. The fix included in this commit declares a temporary + variable for the purposes of being initialized to the desired value, + via the struct initializer, and then copies the temporary struct (via + '=' struct assignment) to the persistent struct. Thanks to Devin + Matthews for his help with this. + +commit c8728cfbd19ecde9d43af05829e00bcfe7d86eed +Author: Field G. Van Zee +Date: Thu Aug 5 15:17:09 2021 -0500 + + Fixed configure breakage on OSX clang. + + Details: + - Accept either 'clang' or 'LLVM' in vendor string when greping for + the version number (after determining that we're working with clang). + Thanks to Devin Matthews for this fix. + +commit 868b90138e64c873c780d9df14150d2a370a7a42 +Author: Field G. Van Zee +Date: Wed Aug 4 18:31:01 2021 -0500 + + Fixed one-time use property of bli_init() (#525). + + Details: + - Fixes a rather obvious bug that resulted in segmentation fault + whenever the calling application tried to re-initialize BLIS after + its first init/finalize cycle. The bug resulted from the fact that + the bli_init.c APIs made no effort to allow bli_init() to be called + subsequent times at all due to it, and bli_finalize(), being + implemented in terms of pthread_once(). This has been fixed by + resetting the pthread_once_t control variable for initialization + at the end of bli_finalize_apis(), and by resetting the control + variable for finalization at the end of bli_init_apis(). Thanks to + @lschork2 for reporting this issue (#525), and to Minh Quan Ho and + Devin Matthews for suggesting the chosen solution. + - CREDITS file update. 
+ +commit 8dba1e752c6846a85dea50907135bbc5cbc54ee5 +Author: Field G. Van Zee +Date: Tue Jul 27 12:38:24 2021 -0500 + + CREDITS file update. + +commit cc9206df667b7c710b57b190b8ad351176de53b8 +Author: Field G. Van Zee +Date: Fri Jul 16 15:48:37 2021 -0500 + + Added Graviton2 Neoverse N1 performance results. + + Details: + - Added single-threaded and multithreaded performance results to + docs/Performance.md. These results were gathered on a Graviton2 + Neoverse N1 server. Special thanks to Nicholai Tukanov for + collecting these results via the Arm-HPC/AWS hackathon. + - Corrected what was supposed to be a temporary tweak to the legend + labels in test/3/octave/plot_l3_perf.m. + +commit fab5c86d68137b59800715efb69214c0a7e458a7 +Merge: 84f9dcd4 d073fc9a +Author: Devin Matthews +Date: Tue Jul 13 16:46:21 2021 -0500 + + Merge pull request #516 from nicholaiTukanov/p10-sandbox-rework + + P10 sandbox rework + +commit 84f9dcd449fa7a4cf4087fca8ec4ca0d10e9b801 +Author: Devin Matthews +Date: Tue Jul 13 16:45:44 2021 -0500 + + Remove unnecessary windows/zen2 directory. + +commit 21911d6ed3438ca4ba942d05851ba5d7e9835586 +Merge: 17729cf4 689fa0f4 +Author: Field G. Van Zee +Date: Fri Jul 9 18:10:46 2021 -0500 + + Merge branch 'dev' + +commit 17729cf449919d1db9777cea5b65d2efc77e2692 +Author: Devin Matthews +Date: Fri Jul 9 14:59:48 2021 -0500 + + Add vzeroupper to Haswell microkernels. (#524) + + Details: + - Added vzeroupper instruction to the end of all 'gemm' and 'gemmtrsm' + microkernels so as to avoid a performance penalty when mixing AVX + and SSE instructions. These vzeroupper instructions were once part + of the haswell kernels, but were inadvertently removed during a source + code shuffle some time ago when we were managing duplicate 'haswell' + and 'zen' kernel sets. Thanks to Devin Matthews for tracking this down + and re-inserting the missing instructions.
+ +commit c9a7f59aa84daa54d8f8c771f1f1ef2bd8730da2 +Merge: 75f03907 9a8e649c +Author: Devin Matthews +Date: Thu Jul 8 14:00:38 2021 -0500 + + Merge pull request #522 from flame/windows-avx512 + + Fix Win64 AVX512 bug. + +commit 9a8e649c5ac89eba951bbee7136ca28aeb24d731 +Author: Devin Matthews +Date: Wed Jul 7 15:23:57 2021 -0500 + + Fix Win64 AVX512 bug. + + Use `-march=haswell` for kernels. Fixes #514. + +commit 75f03907c58385b656c8bd35d111db245814a9f3 +Author: Devin Matthews +Date: Wed Jul 7 15:44:11 2021 -0500 + + Add comment about make checkblas on Windows + + [ci skip] + +commit 4651583b1204a965e4aa672c7ad6de60f3ab1600 +Merge: 69205ac2 174f7fc9 +Author: Devin Matthews +Date: Wed Jul 7 01:11:20 2021 -0500 + + Merge pull request #520 from flame/travis-ci-install + + Test installation in Travis CI + +commit 69205ac266947723ad4d7bb028b7521fe5c76991 +Author: Field G. Van Zee +Date: Tue Jul 6 20:39:22 2021 -0500 + + CREDITS file update. + + Details: + - Thanks to Chengguo Sun for submitting #515 (5ef7f68). + - Thanks to Andrew Wildman for submitting #519 (551c6b4). + - Whitespace update to configure (spaces to tabs). + +commit 174f7fc9a11712c7bd1a61510bdc5c262b3e8e1f +Author: Devin Matthews +Date: Tue Jul 6 19:35:55 2021 -0500 + + Test installation in Travis CI + +commit 551c6b4ee8cd9dd2e1d1b46c8dde09eb50b91b2c +Merge: 78eac6a0 f648df4e +Author: Devin Matthews +Date: Tue Jul 6 19:32:53 2021 -0500 + + Merge pull request #519 from awild82/oot_build_bugfix + + Fix installation from out-of-tree builds + +commit f648df4e5588f069b2db96f8be320ead0c1967ef +Author: Andrew Wildman +Date: Tue Jul 6 16:35:12 2021 -0700 + + Add symlink to blis.pc.in for out-of-tree builds + +commit 78eac6a0ab78c995c3f4e46a9e87388b5c3e1af6 +Author: Devin Matthews +Date: Tue Jul 6 11:05:43 2021 -0500 + + Revert "Always run `make check`." + + This reverts commit a201a53440c51244739aaee20e3309b50121cc68. 
+ +commit a201a53440c51244739aaee20e3309b50121cc68 +Author: Devin Matthews +Date: Mon Jul 5 21:39:18 2021 -0500 + + Always run `make check`. + + I'm concerned that problems may lurk for `x86_64` builds on Windows which may be uncovered by a fuller `make check`. + +commit 5ef7f684dc75fc707c82f919e0836615f90a2627 +Merge: aaa10c87 ad6231cc +Author: Devin Matthews +Date: Mon Jul 5 21:35:07 2021 -0500 + + Merge pull request #515 from chengguosun/bug-fix + + Fixed configure script bug. + +commit ad6231cca3fc1e477752ecd31b1ee2323398a642 +Author: sunchengguo +Date: Tue Jul 6 07:30:00 2021 -0400 + + Fixed configure script bug. + Details: + - Fixed kernel list string substitution error by adding function substitute_words in configure script. + If the string contains zen and zen2, and zen needs to be replaced with another string, then zen2 + would also be incorrectly replaced. + +commit d073fc9acac9d702556cab9fbbb3a253eeb1f998 +Author: nicholaiTukanov +Date: Fri Jul 2 19:54:33 2021 -0500 + + Update POWER10.md + +commit 907226c0af4afb6323b4e02be4f73f5fb89cddaf +Author: nicholaiTukanov +Date: Fri Jul 2 19:47:18 2021 -0500 + + Rework POWER10 sandbox + + - Add a testsuite for gathering performance (in GFLOPs) and measuring correctness for the POWER10 GEMM reduced precision/integer kernels. + - Reworked GENERIC_GEMM template to hardcode the cache parameters. + - Remove kernel wrapper that only allowed matrices that weren't transposed or conjugated. However, the kernels still assume the matrices are not transposed. This wrapper was removed for performance reasons. + - Renamed and restructured files and functions for clarity. + - Edited the POWER10 document to reflect new changes. + +commit aaa10c87e19449674a4ca30fa3b6392bb22c3a66 +Author: Field G. Van Zee +Date: Mon Jun 21 17:53:52 2021 -0500 + + Skip clearing temp microtile in gemmlike sandbox.
+ + Details: + - Removed code from gemmlike sandbox files bls_gemm_bp_var1.c and + bls_gemm_bp_var2.c that initializes the elements of the temporary + microtile to zero. This code, introduced recently in 7f7d726, did + not actually fix any bug (despite that commit's log entry). The + microtile does not need to be initialized because it is completely + overwritten by a "beta = 0" invocation of gemm prior to it being + read. Any NaNs or Infs present at the outset would have no impact + on the output matrix C. Thanks to Devin Matthews for reminding me + of this. + +commit bc10a3f2ff518360c32bea825b3eb62a9e4c8a77 +Merge: bf727636 6548ceba +Author: Devin Matthews +Date: Fri Jun 18 19:01:08 2021 -0500 + + Merge pull request #492 from flame/thunderx2-clang + + Allow clang for ThunderX2 config + +commit bf727636632a368f3247dc8ab1d4b6119e9c511a +Merge: e28f2a2d 5fc93e28 +Author: Devin Matthews +Date: Fri Jun 18 18:59:43 2021 -0500 + + Merge pull request #506 from xrq-phys/arm64-mac + + BLIS on Darwin_Aarch64 + +commit e28f2a2dfcff14e7094fce0b279b3a917b3ab98c +Merge: d10e05bb 56ffca6a +Author: Devin Matthews +Date: Tue Jun 15 19:35:07 2021 -0500 + + Merge pull request #513 from nicholaiTukanov/asm_warning_p9_fix + + Fix assembler warning in POWER9 DGEMM + +commit 56ffca6a9bc67432a7894298739895f406e5f467 +Author: nicholai +Date: Tue Jun 15 18:17:39 2021 -0500 + + Fix asm warning + +commit 689fa0f40399bde1acc5367d6dd4e8fc4eb6f3ea +Merge: b683d01b d10e05bb +Author: Field G. Van Zee +Date: Sun Jun 13 19:44:14 2021 -0500 + + Merge branch 'master' into dev + +commit d10e05bbd1ce45ce2c0dfe5c64daae2633357b3f +Author: Field G. Van Zee +Date: Sun Jun 13 19:36:16 2021 -0500 + + Sandbox header edits trigger full library rebuild. + + Details: + - Adjusted the top-level Makefile so that any change to a sandbox header + file will result in blis.h being regenerated along with a full + recompilation of the library. 
Previously, sandbox files were omitted + from the list of header files that, when touched, could trigger a full + rebuild. Why was it like that previously? Because originally we only + envisioned using sandboxes to *replace* gemm, not augment the library + with new functionality. When replacing gemm, blis.h does not need to + contain any local sandbox defintions in order for the user to be able + to (indirectly) use that sandbox. But if you are adding functions to + the library, those functions need to be prototyped so the compiler + can perform type checking against the user's invocation of those new + functions. Thanks to Jeff Diamond for helping us discover this + deficiency in the build system. + +commit 7c3eb44efaa762088c190bb820ef6a3c87db8f65 +Author: Devin Matthews +Date: Wed Jun 2 11:28:22 2021 -0500 + + Add vhsubpd/vhsubpd. + + Horizontal subtraction instructions added to bli_x86_asm_macros.h, currently unused [ci skip]. + +commit 7f7d72610c25f511ba8cd2a53be7b59bdb80f3f3 +Author: Field G. Van Zee +Date: Mon May 31 16:50:18 2021 -0500 + + Fixed bugs in cpackm kernels, gemmlike code. + + Details: + - Fixed intermittent bugs in bli_packm_haswell_asm_c3xk.c and + bli_packm_haswell_asm_c8xk.c whereby the imaginary component of the + kappa scalar was incorrectly loaded at an offset of 8 bytes (instead + of 4 bytes) from the real component. This was almost certainly a copy- + paste bug carried over from the corresonding zpackm kernels. Thanks to + Devin Matthews for bringing this to my attention. + - Added missing code to gemmlike sandbox files bls_gemm_bp_var1.c and + bls_gemm_bp_var2.c that initializes the elements of the temporary + microtile to zero. (This bug was never observed in output but rather + noticed analytically. It probably would have also manifested as + intermittent failures, this time involving edge cases.) + - Minor commented-out/disabled changes to testsuite/src/test_gemm.c + relating to debugging. 
+ +commit 5fc93e280614b4a21a9cff36cf873b4b9407285b +Author: RuQing Xu +Date: Sat May 29 18:44:47 2021 +0900 + + Armv8A Rename Regs for Safe Darwin Compile + + Avoid x18 use in FP32 kernel: + - C address lines x[18-26] renamed to x[19-27] (reg index +1) + - Original role of x27 fulfilled by x5 which is free after k-loop part. + + FP64 does not require changing since x18 is not used there. + +commit 9f4a4a3cfb2244e4024445e127dafd2a11f39fc5 +Author: RuQing Xu +Date: Sat May 29 17:21:28 2021 +0900 + + Armv8A Rename Regs for Clang Compile: FP32 Part + + Roughly the same as 916e1fa, additionally with x15 clobbering removed. + - x15: Not used at all. + + Compilation w/ Clang shows warning about x18 reservation, but + compilation itself is OK and all tests got passed. + +commit 916e1fa8be3cea0e3e2a4a7e8b00027ac2ee7780 +Author: RuQing Xu +Date: Sat May 29 16:46:52 2021 +0900 + + Armv8A Rename Regs for Clang Compile: FP64 Part + + - x7, x8: Used to store address for Alpha and Beta. + As Alpha & Beta were not used in k-loops, use x0, x1 to load + Alpha & Beta's addresses after k-loops are completed, since A & B's + addresses are no longer needed there. + This "ldr [addr]; -> ldr val, [addr]" would not cause much performance + drawback since it is done outside k-loops and there are plenty of + instructions between Alpha & Beta's loading and usage. + - x9: Used to store cs_c. x9 is multiplied by 8 into x10 and not used + any longer. Directly loading cs_c and into x10 and scale by 8 spares + x9 straightforwardly. + - x11, x12: Not used at all. Simply remove from clobber list. + - x13: Alike x9, loaded and scaled by 8 into x14, except that x13 is + also used in a conditional branch so that "cmp x13, #1" needs to be + modified into "cmp x14, #8" to completely free x13. + - x3, x4: Used to store next_a & next_b. Untouched in k-loops. Load + these addresses into x0 and x1 after Alpha & Beta are both loaded, + since then neither address of A/B nor address of Alpha/Beta is needed.
+ +commit 7fabd896af773623ed01820a71bbff432e8a7d25 +Author: RuQing Xu +Date: Sat May 29 16:28:03 2021 +0900 + + Asm Flag Mingling for Darwin_Aarch64 + + Apple+Arm64 requires additional "tagging" of local symbols. + +commit 213dce32d2eed8b7a38c6a3f6112072b0a89ecd0 +Author: Field G. Van Zee +Date: Fri May 28 14:49:57 2021 -0500 + + Added a new 'gemmlike' sandbox. + + Details: + - Added a new sandbox called 'gemmlike', which implements sequential and + multithreaded gemm in the style of gemmsup but also unconditionally + employs packing. The purpose of this sandbox is to + (1) avoid select abstractions, such as objects and control trees, in + order to allow readers to better understand how a real-world + implementation of high-performance gemm can be constructed; + (2) provide a starting point for expert users who wish to build + something that is gemm-like without "reinventing the wheel." + Thanks to Jeff Diamond, Tze Meng Low, Nicholai Tukanov, and Devangi + Parikh for requesting and inspiring this work. + - The functions defined in this sandbox currently use the "bls_" prefix + instead of "bli_" in order to avoid any symbol collisions in the main + library. + - The sandbox contains two variants, each of which implements gemm via a + block-panel algorithm. The only difference between the two is that + variant 1 calls the microkernel directly while variant 2 calls the + microkernel indirectly, via a function wrapper, which allows the edge + case handling to be abstracted away from the classic five loops. + - This sandbox implementation utilizes the conventional gemm microkernel + (not the skinny/unpacked gemmsup kernels). + - Updated some typos in the comments of a few files in the main + framework. + +commit 82af05f54c34526a60fd2ec46656f13e1ac8f719 +Author: Field G. Van Zee +Date: Tue May 25 15:25:08 2021 -0500 + + Updated Fugaku (a64fx) performance results. 
+ + Details: + - Updated the performance graphs (pdfs and pngs) for the Fugaku/a64fx + entry within Performance.md, and also updated the experiment details + accordingly. Thanks to RuQing Xu for re-running the BLIS and SSL2 + experiments reflected in this commit. + - In Performance.md, added an English translation of the project name + under which the Fugaku results were gathered, courtesy of RuQing Xu. + +commit e5c85da3763f73854ecd739ba3008bb467ed77c3 +Merge: cbd8d393 5feb04e2 +Author: Devin Matthews +Date: Mon May 24 16:56:22 2021 -0500 + + Merge pull request #503 from flame/windows-compiler-check + + Add explicit compiler check for Windows. + +commit cbd8d3932599485727204479fded66ac19186db4 +Merge: 6d4ab022 932dfe6a +Author: Devin Matthews +Date: Mon May 24 16:32:42 2021 -0500 + + Merge pull request #500 from xrq-phys/armsve+travis + + Upgrade Travis CI for Arm SVE + +commit 5feb04e233e1e6f81c727578ad9eae1367a2562f +Author: Devin Matthews +Date: Sun May 23 18:46:56 2021 -0500 + + Add explicit compiler check for Windows. + + Check the C compiler for a predefined macro `_WIN32` to indicate (cross-)compilation for Windows. Fixes #463. + +commit 6d4ab0223d9014ac2a66d66759536aa305be5867 +Merge: 61584ded 859fb77a +Author: Devin Matthews +Date: Sun May 23 18:39:53 2021 -0500 + + Merge pull request #502 from flame/rm-rm-dupls + + Remove `rm-dupls` function in common.mk. + +commit 859fb77a320a3ace71d25a8885c23639b097a1b6 +Author: Devin Matthews +Date: Sun May 23 18:15:23 2021 -0500 + + Remove `rm-dupls` function in common.mk. + + AMD requested removal due to unclear licensing terms; original code was from stackoverflow. The function is unused but could easily be replaced by new implementation. 
+ +commit 932dfe6abb9617223bd26a249e53447169033f8c +Author: RuQing Xu +Date: Thu May 20 02:07:31 2021 +0900 + + Travis CI Revert Unnecessary Extras from 91d3636 + + - Removed `V=1` in make line + - Removed `CFLAGS` in configure line + - Restored `pwd` surrounding OOT line + +commit bd156a210d347a073a6939cc4adab3d9256c2e2b +Author: RuQing Xu +Date: Sun May 16 02:56:14 2021 +0900 + + Adjust TravisCI + + - ArmSVE don't test gemmt (seems Qemu-only problem); + - Clang use TravisCI-provided version instead of fixing to clang-8 + due to that clang-8 seems conflicting with TravisCI's clang-7. + +commit 91d3636031021af3712d14c9fcb1eb34b6fe2a31 +Author: RuQing Xu +Date: Sat May 15 17:05:16 2021 +0900 + + Travis Support Arm SVE + + - Updated distro to 20.04 focal aarch64-gcc-10. + This is minimal version required by aarch64-gcc-10. + SVE intrinsics would not compile without GCC >=10. + - x86 toolchains use official repo instead of ubuntu-toolchain-r/test. + 20.04 focal is not supported by that PPA at the moment. + - Add extra configuration-time options to .travis.yml. + - Add Arm SVE entry to .travis.yml. + +commit 61584deddf9b3af6d11a811e6e04328d22390202 +Author: RuQing Xu +Date: Wed May 19 23:52:29 2021 +0900 + + Added 512b SVE-based a64fx subconfig + SVE kernels. + + Details: + - Added 512-bit specific 'a64fx' subconfiguration that uses empirically + tuned block size by Stepan Nassyr. This subconfig also sets the sector + cache size and enables memory-tagging code in SVE gemm kernels. This + subconfig utilizes (16, k) and (10, k) DPACKM kernels. + - Added a vector-length agnostic 'armsve' subconfiguration that computes + blocksizes according to the analytical model. This part is ported from + Stepan Nassyr's repository. + - Implemented vector-length-agnostic [d/s/sh] gemm kernels for Arm SVE + at size (2*VL, 10). These kernels use unindexed FMLA instructions + because indexed FMLA takes 2 FMA units in many implementations. 
+ PS: There are indexed-FMLA kernels in Stepan Nassyr's repository. + - Implemented 512-bit SVE dpackm kernels with in-register transpose + support for sizes (16, k) and (10, k). + - Extended 256-bit SVE dpackm kernels by Linaro Ltd. to 512-bit for + size (12, k). This dpackm kernel is not currently used by any + subconfiguration. + - Implemented several experimental dgemmsup kernels which would + improve performance in a few cases. However, those dgemmsup kernels + generally underperform hence they are not currently used in any + subconfig. + - Note: This commit squashes several commits submitted by RuQing Xu via + PR #424. + +commit b683d01b9c4ea5f64c8031bda816beccfbf806a0 +Author: Field G. Van Zee +Date: Thu May 13 15:23:22 2021 -0500 + + Use extra #undef when including ba/ex API headers. + + Details: + - Inserted a "#include bli_xapi_undef.h" after each usage of the basic + and expert API macro setup headers: bli_oapi_ba.h, bli_oapi_ex.h, + bli_tapi_ba.h, and bli_tapi_ex.h. This is functionally equivalent to + the previous status quo, in which each header made minimal #undef + prior to its own definitions and then a single instance of + "#include bli_xapi_undef.h" cleaned up any remaining macro defs after + all other headers were used. This commit will guarantee that macro + defs from the setup of one header (say, bli_oapi_ex.h) don't "infect" + the definitions made in a subsequent header. As with the previous + commit, this change does not fix any issue but rather attempts to + avoid creating orphaned macro definitions that are only needed within + a very limited scope. + - Removed minimal #undef from bli_?api_[ba|ex].h. + - Removed old commented-out lines from bli_?api_[ba|ex].h. + +commit d4427a5b2f5cab5d2a64c58d87416628867c2b4a +Author: Field G. Van Zee +Date: Thu May 13 13:55:11 2021 -0500 + + Minor preprocessor/header cleanup.
+ + Details: + - Added frame/include/bli_xapi_undef.h, which explicitly undefines all + macros defined in bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and + bli_tapi_ex.h. (This is for safety and good cpp coding practice, not + because it fixes anything.) + - Added #include "bli_xapi_undef.h" to bli_l1v.h, bli_l1d.h, bli_l1f.h, + bli_l1m.h, bli_l2.h, bli_l3.h, and bli_util.h. + - Comment updates to bli_oapi_ba.h, bli_oapi_ex.h, bli_tapi_ba.h, and + bli_tapi_ex.h. + - Moved frame/3/bli_l3_ft_ex.h to local 'old' directory after realizing + that nothing in BLIS used those function pointer types. Also commented + out the "#include bli_l3_ft_ex.h" directive in frame/3/bli_l3.h. + +commit 5aa63cd927b22a04e581b07d0b68ef391f4f9b1f +Author: Field G. Van Zee +Date: Wed May 12 19:53:35 2021 -0500 + + Fixed typo in cpp guard in bli_util_ft.h. + + Details: + - Changed #ifdef BLIS_OAPI_BASIC to #ifdef BLIS_TAPI_BASIC in + bli_util_ft.h. This typo was causing some types to be redefined when + they weren't supposed to be. + +commit f0e8634775094584e89f1b03811ee192f2aaf67f +Author: Field G. Van Zee +Date: Wed May 12 18:45:32 2021 -0500 + + Defined eqsc, eqv, eqm to test object equality. + + Details: + - Defined eqsc, eqv, and eqm operations, which set a bool depending on + whether the two scalars, two vectors, or two matrix operands are equal + (element-wise). eqsc and eqv support implicit conjugation and eqm + supports diagonal offset, diag, uplo, and trans parameters (in a + manner consistent with other level-1m operations). These operations + are currently housed under frame/util, at least for now, because they + are not computational in nature. + - Redefined bli_obj_equals() in terms of eqsc, eqv, and eqm. + - Documented eqsc, eqv, and eqm in BLISObjectAPI.md and BLISTypedAPI.md. + Also: + - Documented getsc and setsc in both docs. + - Reordered entry for setijv in BLISTypedAPI.md, and added separator + bars to both docs. 
+ - Added missing "Observed object properties" clauses to various + level-1v entries in BLISObjectAPI.md. + - Defined bli_apply_trans() in bli_param_macro_defs.h. + - Defined supporting _check() function, bli_l0_xxbsc_check(), in + bli_l0_check.c for eqsc. + - Programming style and whitespace updates to bli_l1m_unb_var1.c. + - Whitespace updates to bli_l0_oapi.c, bli_l1m_oapi.c. + - Consolidated redundant macro redefinition for copym function pointer + type in bli_l1m_ft.h. + - Added macros to bli_oapi_ba.h, _ex.h, and bli_tapi_ba.h, _ex.h that + allow oapi and tapi source files to forego defining certain expert + functions. (Certain operations such as printv and printm do not need + to have both basic and expert interfaces. This also includes eqsc, eqv, + and eqm.) + +commit 5d46dbee4a06ba5a422e19817836976f8574cb4f +Author: Devin Matthews +Date: Wed May 12 18:42:09 2021 -0500 + + Replace bli_dlamch with something less archaic (#498) + + Details: + - Added new implementations of bli_slamch() and bli_dlamch() that use + constants from the standard C library in lieu of dynamically-computed + values (via code inherited from netlib). The previous implementation + is still available when the cpp macro BLIS_ENABLE_LEGACY_LAMCH is + defined by the subconfiguration at compile-time. Thanks to Devin + Matthews for providing this patch, and to Stefano Zampini for + reporting the issue (#497) that prompted Devin to propose the patch. + +commit 6a89c7d8f9ac3f51b5b4d8ccb2630d908d951e6f +Author: Field G. Van Zee +Date: Sat May 1 18:54:48 2021 -0500 + + Defined setijv, getijv to set/get vector elements. + + Details: + - Defined getijv, setijv operations to get and set elements of a vector, + in bli_setgetijv.c and .h. + - Renamed bli_setgetij.c and .h to bli_setgetijm.c and .h, respectively. + - Added additional bounds checking to getijm and setijm to prevent + actions with negative indices. + - Added documentation to BLISObjectAPI.md and BLISTypedAPI.md for getijv + and setijv.
+ - Added documentation to BLISTypedAPI.md for getijm and setijm, which + were inadvertently missing. + - Added a new entry to the FAQ titled "Why does BLIS have vector + (level-1v) and matrix (level-1m) variations of most level-1 + operations?" + - Comment updates. + +commit 4534daffd13ed7a8983c681d3f5e9de17c9f0b96 +Author: Field G. Van Zee +Date: Tue Apr 27 18:16:44 2021 -0500 + + Minor API breakage in bli_pack API. + + Details: + - Changed bli_pack_get_pack_a() and bli_pack_get_pack_b() so that + instead of returning a bool, they set a bool that is passed in by + address. This does break the public exported API, but I expect very + few users actually use this function. (This change is being made in + preparation for a much more extensive commit relating to error + checking.) + +commit 6a4aa986ffc060d3e64ed230afe318b82630f8b2 +Author: Field G. Van Zee +Date: Fri Apr 23 13:10:01 2021 -0500 + + Fixed typo in Table of Contents. + +commit f6424b5b82160d346a09a0fbb526981ecf66cdb3 +Author: Field G. Van Zee +Date: Fri Apr 23 13:08:06 2021 -0500 + + Added dedicated Performance section to README.md. + + Details: + - Spun off the Performance.md and PerformanceSmall.md links in the + Documentation section into a new Performance section dedicated to + those two links. (The previous entries remain redundantly listed + within Documentation section.) Thanks to Robert van de Geijn for + suggesting this change. + +commit 40ce5fd241b9ad140bf57278d440f0598d7f15d8 +Merge: 6280757b 1f3461a5 +Author: Devin Matthews +Date: Wed Apr 21 09:54:25 2021 -0500 + + Merge pull request #493 from cassiersg/patch-1 + + Fix typo in FAQ.md + +commit 1f3461a5a5a88510f913451a93e3190ec1556f39 +Author: Gaëtan Cassiers +Date: Wed Apr 21 16:49:05 2021 +0200 + + Fix typo in FAQ.md + +commit 6548cebaf55a1f9bdb8417cc89dd0444d8f9c2e4 +Author: Devin Matthews +Date: Wed Apr 14 13:00:42 2021 -0500 + + Allow clang for ThunderX2 config + + Needed for compiling on e.g. Mac M1. 
AFAIK clang supports the same -mcpu flag for ThunderX2 as gcc. + +commit 6280757be32f90fd77d8dd9357b07d9306e6f80d +Author: Field G. Van Zee +Date: Wed Apr 7 13:03:56 2021 -0500 + + Minor updates to a64fx section of Performance.md. + +commit 1e6ed823c6cd11f9b671779f3c8bdbd2bbb40f34 +Author: RuQing Xu +Date: Thu Apr 8 02:59:26 2021 +0900 + + Additional A64fx Comments (#490) + + * Performance.md Update A64fx Comments + + - Reason for ARMPL's missing data; + - Additional envs / flags for kernel selection; + - Update BLIS SRC commit. + + * Include Another Fix in armsve-cfg-vendor + + A prototype was forgotten, causing that void* pointer was not fully returned. + +commit 2688f21a5b073950f6f187c95917fdbb5aac234a +Author: Field G. Van Zee +Date: Tue Apr 6 19:02:37 2021 -0500 + + Added Fujitsu A64fx (512-bit SVE) perf results. + + Details: + - Added single-threaded and multithreaded performance results to + docs/Performance.md. These results were gathered on the "Fugaku" + Fujitsu A64fx supercomputer at the RIKEN Center for Computational + Science in Kobe, Japan. Special thanks to RuQing Xu and Stepan + Nassyr for their work in developing and optimizing A64fx support in + BLIS and RuQing for gathering the performance data that is reflected + in these new graphs. + +commit ba3ba8da83d48397162139e11337c036a631ba79 +Author: Field G. Van Zee +Date: Tue Apr 6 18:39:58 2021 -0500 + + Minor updates and fixes to test/3/octave scripts. + + Details: + - Fixed an issue where the wrong string was being passed in for the + vendor legend string. + - Changed the graph in which the legends appear. + - Updates to runthese.m. + +commit 09bd4f4f12311131938baa9f75d27e92b664d681 +Author: Field G. Van Zee +Date: Wed Mar 31 17:09:36 2021 -0500 + + Add err_t* "return" parameter to malloc functions. + + Details: + - Added an err_t* parameter to memory allocation functions including + bli_malloc_intl(), bli_calloc_intl(), bli_malloc_user(), + bli_fmalloc_align(), and bli_fmalloc_noalign(). 
Since these functions + already use the return value to return the allocated memory address, + they can't communicate errors to the caller through the return value. + This commit does not employ any error checking within these functions + or their callers, but this sets up BLIS for a more comprehensive + commit that moves in that direction. + - Moved the typedefs for malloc_ft and free_ft from bli_malloc.h to + bli_type_defs.h. This was done so that what remains of bli_malloc.h + can be included after the definition of the err_t enum. (This ordering + was needed because bli_malloc.h now contains function prototypes that + use err_t.) + - Defined bli_is_success() and bli_is_failure() static functions in + bli_param_macro_defs.h. These functions provide easy checks for error + codes and will be used more heavily in future commits. + - Unfortunately, the additional err_t* argument discussed above breaks + the API for bli_malloc_user(), which is an exported symbol in the + shared library. However, it's quite possible that the only application + that calls bli_malloc_user()--indeed, the reason it was marked for + symbol exporting to begin with--is the BLIS testsuite. And if that's + the case, this breakage won't affect anyone. Nonetheless, the "major" + part of the so_version file has been updated accordingly to 4.0.0. + +commit f9ad55ce7e12f59930605753959fcfd41a218d8d +Merge: 04502492 90508192 +Author: Field G. Van Zee +Date: Wed Mar 31 14:20:19 2021 -0500 + + Merge branch 'master' into dev + +commit 90508192f2d6ae95adc2a3ba9f4e5bad2c8d6fd2 +Author: Devin Matthews +Date: Tue Mar 30 21:16:44 2021 -0500 + + Update do_sde.sh (#489) + + Update to a newer version of SDE, and do a direct download as it seems you don't have to click through the license anymore. + +commit 22c6b5dc4c9cc21942f8ccc30891f9b4385a9504 +Author: Nicholai Tukanov +Date: Tue Mar 30 19:07:42 2021 -0500 + + Fixed bug in power10 microkernel I/O.
(#488) + + Details: + - Fixed a bug in the POWER10 DGEMM kernel whereby the microkernel did + not store the microtile result correctly due to incorrect index + calculations. (The error was introduced when I reorganized the + 'kernels/power10/3' directory.) + +commit 04502492671456b94bcdee60b9de347b6763a32d +Author: Field G. Van Zee +Date: Sun Mar 28 19:11:43 2021 -0500 + + Always stay initialized after BLAS compat calls. + + Details: + - Removed the option to finalize BLIS after every BLAS call, which also + means that BLIS would initialize at the beginning of every BLAS call. + This option never really made sense and wasn't even implemented + properly to begin with. (Because bli_init_auto() and _finalize_auto() + were implemented in terms of bli_init_once() and _finalize_once(), + respectively, the application would have only been able to call one + BLAS routine before BLIS would find itself in an unusable, permanently + uninitialized state.) Because this option was never meant for regular + use, it never made it into configure as an actual configure-time + option, and therefore this commit only removes parts of the code + affected by the cpp macro guard BLIS_ENABLE_STAY_AUTO_INITIALIZED. + +commit 3a6f41afb8197e831b6ce2f1ae7f63735685fa0a +Author: Field G. Van Zee +Date: Sat Mar 27 17:22:14 2021 -0500 + + Renamed membrk files/vars/functions to pba. + + Details: + - Renamed the files, variables, and functions relating to the packing + block allocator from its legacy name (membrk) to its current name + (pba). This more clearly contrasts the packing block allocator with + the small block allocator (sba). + - Fixed a typo in bli_pack_set_pack_b(), defined in bli_pack.c, that + caused the function to erroneously change the value of the pack_a + field of the global rntm_t instead of the pack_b field. (Apparently + nobody has used this API yet.) + - Comment updates. + +commit 36cb4116d15cfef2d42ec4a834efd4a958f261b5 +Author: Field G.
Van Zee +Date: Sat Mar 27 15:15:09 2021 -0500 + + Switch allocator mutexes to static initialization. + + Details: + - Switched the small block allocator (sba), as defined in bli_sba.c and + bli_apool.c, to static initialization of its internal mutex. Did a + similar thing for the packing block allocator (pba), which appears as + global_membrk in bli_membrk.c. + - Commented out bli_membrk_init_mutex() and bli_membrk_finalize_mutex() + to ensure they won't be used in the future. + - In bli_thrcomm_pthreads.c and .h, removed old, commented-out cpp + blocks guarded by BLIS_USE_PTHREAD_MUTEX. + +commit 159ca6f01a5f91b93513134c9470b69ff78f5354 +Author: Field G. Van Zee +Date: Wed Mar 24 15:57:32 2021 -0500 + + Made test/3/octave scripts robust to missing data. + + Details: + - Modified the octave scripts in test/3 so that the script does not + choke when one or more of the expected OpenBLAS, Eigen, or vendor data + files is missing. (The BLIS data set, however, must be complete.) When + a file is missing, that data series is simply not included on that + particular graph. Also factored out a lot of the redundant logic from + plot_panel_4x5.m into a separate function in read_data.m. + +commit 545e6c2f6d09d023b353002a9a43b11aa0c1d701 +Author: Field G. Van Zee +Date: Mon Mar 22 17:42:33 2021 -0500 + + CHANGELOG update (0.8.1) + +commit 8535b3e11d2297854991c4272932ce4974dda629 (tag: 0.8.1) Author: Field G. Van Zee Date: Mon Mar 22 17:42:33 2021 -0500 Version file update (0.8.1) -commit e56d9f2d94ed247696dda2cbf94d2ca05c7fc089 (origin/master, origin/HEAD) +commit e56d9f2d94ed247696dda2cbf94d2ca05c7fc089 Author: Field G. Van Zee Date: Mon Mar 22 17:40:50 2021 -0500 @@ -163,7 +3041,7 @@ Date: Fri Mar 5 13:53:43 2021 -0600 information, refer to the POWER10.md document that is included in 'sandbox/power10'. 
-commit b8dcc5bc75a746807d6f8fa22dc2123c98396bf5 (origin/dev, origin/amd, dev, amd) +commit b8dcc5bc75a746807d6f8fa22dc2123c98396bf5 Author: RuQing Xu Date: Tue Mar 2 06:58:24 2021 +0800 @@ -6796,7 +9674,7 @@ Date: Mon Oct 15 16:37:39 2018 -0500 - Updated frame/include/bli_x86_asm_macros.h with additional macros (courtsey of Devin Matthews). -commit 3612ecac98a9d36c3fcd64154121d420bb69febd (origin/nested-omp-patch) +commit 3612ecac98a9d36c3fcd64154121d420bb69febd Author: Field G. Van Zee Date: Thu Oct 11 15:16:41 2018 -0500 From 69fa915464c52f09a5971a60f521900d31a34e69 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 1 Apr 2022 08:47:46 -0500 Subject: [PATCH 048/230] Fixed broken "tagged releases" link in README.md. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 211ebd6d5..8b355470c 100644 --- a/README.md +++ b/README.md @@ -357,7 +357,7 @@ This should reveal a link for downloading the zip file. 3. **Download a source release via a tarball/zip file.** Alternatively, if you would like to stick to the code that is included in official releases, you may download either a tarball or zip file of any of -BLIS's previous [tagged releases](https://github.com/flame/blis/releases). +BLIS's previous [tagged releases](https://github.com/flame/blis/tags). We consider this option to be less than ideal for most people since it will likely mean you miss out on the latest bugfix or feature commits (in contrast to Options 1 or 2), and you also will not be able to update your code with a From b3e674db3c05ca586b159a71deb1b61d701ae5c9 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 4 Apr 2022 17:31:02 -0500 Subject: [PATCH 049/230] README.md update to link to releases page. 
--- README.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8b355470c..3803acdca 100644 --- a/README.md +++ b/README.md @@ -345,6 +345,18 @@ to executing the following command in your terminal shell: ``` git clone https://github.com/flame/blis.git ``` + At this point, you will have the latest commit of the `master` branch +checked out. If you wish to check out a particular version x.y.z, execute +the following: + ``` + git checkout x.y.z + ``` + `git` will then transform your working copy to match the state of the +commit associated with version x.y.z. You can view a list of tags at any +time by executing: + ``` + git tag --list + ``` 2. **Download a source repository via a zip file.** If you are uncomfortable with using `git` but would still like the latest @@ -356,8 +368,11 @@ This should reveal a link for downloading the zip file. 3. **Download a source release via a tarball/zip file.** Alternatively, if you would like to stick to the code that is included in -official releases, you may download either a tarball or zip file of any of -BLIS's previous [tagged releases](https://github.com/flame/blis/tags). +official releases, you may download either a tarball or zip file of BLIS's +latest [release](https://github.com/flame/blis/releases). Some older releases +are only available as [tagged](https://github.com/flame/blis/tags) commits. +(Note: downloading release x.y.z is equivalent to downloading, or checking out, +tag `x.y.z`.) We consider this option to be less than ideal for most people since it will likely mean you miss out on the latest bugfix or feature commits (in contrast to Options 1 or 2), and you also will not be able to update your code with a From ae10d9495486f589ed0320f0151b2d195574f1cf Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 6 Apr 2022 20:31:11 -0500 Subject: [PATCH 050/230] Simplify and rewrite reference packm kernels. 
(#610) Details: - Reorganized the way kernels are stored within the cntx_t structure so that rather than having a function pointer for every supported size of unrolled packm kernel (2xk, 3xk, 4xk, etc.), we store only two packm kernels per datatype: one to pack MRxk micropanels and one to pack NRxk micropanels. - NOTE: The "bb" (broadcast B) reference kernels have been merged into the "standard" kernels (packm [including 1er and unpackm], gemm, trsm, gemmtrsm). This replication factor is controlled by BLIS_BB[MN]_[sdcz] etc. Power9/10 needs testing since only a replication factor of 1 has been tested. armsve also needs testing since the MR value isn't available as a macro. - Simplified the bli_cntx_*() APIs to conform to the new unified kernel array within the cntx_t. Updated existing bli_cntx_init_() function definitions for all subconfigurations. - Consolidated all kernel id types (e.g. l1vkr_t, l1mkr_t, l3ukr_t, etc.) into one kernel id type: ukr_t. - Various edits, updates, and rewrites of reference kernels pursuant to the aforementioned changes. - Define compile-time macro constants (BLIS_MR_[sdcz], BLIS_NR_[sdcz], and friends) in bli_kernel_macro_defs.h, but only when the macro BLIS_IN_REF_KERNEL is defined by the build system. - Loose ends: - Still need to update documentation, including: - docs/ConfigurationHowTo.md - docs/KernelsHowTo.md to reflect changes made in this commit. 
--- addon/gemmd/attic/bao_gemmd_bp_var2.c | 8 +- addon/gemmd/bao_gemmd.c | 2 +- addon/gemmd/bao_gemmd_bp_var1.c | 2 +- addon/gemmd/bao_packm_cxk.c | 6 +- common.mk | 4 + config/a64fx/bli_cntx_init_a64fx.c | 100 +- config/a64fx/bli_kernel_defs_a64fx.h | 52 + config/armsve/bli_cntx_init_armsve.c | 119 +- config/armsve/bli_kernel_defs_armsve.h | 58 + config/bgq/bli_cntx_init_bgq.c | 35 +- config/bgq/bli_kernel_defs_bgq.h | 48 + config/bulldozer/bli_cntx_init_bulldozer.c | 41 +- config/bulldozer/bli_kernel_defs_bulldozer.h | 52 + config/cortexa15/bli_cntx_init_cortexa15.c | 35 +- config/cortexa15/bli_kernel_defs_cortexa15.h | 48 + config/cortexa53/bli_cntx_init_cortexa53.c | 35 +- config/cortexa53/bli_kernel_defs_cortexa53.h | 48 + config/cortexa57/bli_cntx_init_cortexa57.c | 35 +- config/cortexa57/bli_kernel_defs_cortexa57.h | 48 + config/cortexa9/bli_cntx_init_cortexa9.c | 35 +- config/cortexa9/bli_kernel_defs_cortexa9.h | 48 + config/excavator/bli_cntx_init_excavator.c | 41 +- config/excavator/bli_kernel_defs_excavator.h | 52 + config/firestorm/bli_cntx_init_firestorm.c | 143 +- config/firestorm/bli_kernel_defs_firestorm.h | 48 + config/generic/bli_kernel_defs_generic.h | 42 + config/haswell/bli_cntx_init_haswell.c | 247 +- config/haswell/bli_kernel_defs_haswell.h | 52 + config/knc/bli_cntx_init_knc.c | 34 +- config/knc/bli_kernel_defs_knc.h | 48 + config/knl/bli_cntx_init_knl.c | 71 +- config/knl/bli_kernel_defs_knl.h | 48 + config/old/armv7a/bli_cntx_init_armv7a.c | 2 +- config/old/haswellbb/bli_cntx_init_haswell.c | 2 +- config/penryn/bli_cntx_init_penryn.c | 47 +- config/penryn/bli_kernel_defs_penryn.h | 48 + config/piledriver/bli_cntx_init_piledriver.c | 41 +- .../piledriver/bli_kernel_defs_piledriver.h | 52 + config/power10/bli_cntx_init_power10.c | 102 +- config/power10/bli_kernel_defs_power10.h | 51 + config/power7/bli_cntx_init_power7.c | 32 +- config/power7/bli_kernel_defs_power7.h | 46 + config/power9/bli_cntx_init_power9.c | 101 +- 
config/power9/bli_kernel_defs_power9.h | 49 + .../sandybridge/bli_cntx_init_sandybridge.c | 41 +- .../sandybridge/bli_kernel_defs_sandybridge.h | 52 + config/skx/bli_cntx_init_skx.c | 53 +- config/skx/bli_kernel_defs_skx.h | 48 + .../steamroller/bli_cntx_init_steamroller.c | 41 +- .../steamroller/bli_kernel_defs_steamroller.h | 52 + config/template/bli_cntx_init_template.c | 53 +- config/template/bli_kernel_defs_template.h | 60 + config/thunderx2/bli_cntx_init_thunderx2.c | 35 +- config/thunderx2/bli_kernel_defs_thunderx2.h | 48 + config/zen/bli_cntx_init_zen.c | 337 +-- config/zen/bli_kernel_defs_zen.h | 52 + config/zen2/bli_cntx_init_zen2.c | 309 +-- config/zen2/bli_kernel_defs_zen2.h | 52 + config/zen3/bli_cntx_init_zen3.c | 344 +-- config/zen3/bli_kernel_defs_zen3.h | 52 + docs/ConfigurationHowTo.md | 24 +- frame/1/bli_l1v_tapi.c | 132 +- frame/1/other/packv/bli_packv_unb_var1.c | 2 +- frame/1/other/unpackv/bli_unpackv_unb_var1.c | 2 +- frame/1d/bli_l1d_tapi.c | 171 +- frame/1f/bli_l1f_tapi.c | 114 +- frame/1m/bli_l1m_ft_ker.h | 17 +- frame/1m/bli_l1m_ker.h | 54 +- frame/1m/bli_l1m_ker_prot.h | 19 +- frame/1m/bli_l1m_unb_var1.c | 8 +- frame/1m/{packm => other}/bli_packm_cxk.c | 35 +- frame/1m/{packm => other}/bli_packm_cxk.h | 0 frame/1m/{packm => other}/bli_packm_cxk_1er.c | 7 +- frame/1m/{packm => other}/bli_packm_cxk_1er.h | 0 .../bli_packm_struc_cxk_1er.c | 0 .../bli_packm_struc_cxk_1er.h | 0 frame/1m/{unpackm => other}/bli_unpackm_cxk.c | 8 +- frame/1m/{unpackm => other}/bli_unpackm_cxk.h | 1 + frame/1m/packm/bli_packm.h | 4 - frame/1m/packm/bli_packm_blk_var1.c | 8 +- frame/1m/packm/bli_packm_struc_cxk.c | 599 ++--- frame/1m/unpackm/bli_unpackm.h | 2 - frame/1m/unpackm/bli_unpackm_blk_var1.c | 51 +- frame/2/gemv/bli_gemv_unb_var1.c | 2 +- frame/2/gemv/bli_gemv_unb_var2.c | 2 +- frame/2/gemv/bli_gemv_unf_var1.c | 2 +- frame/2/gemv/bli_gemv_unf_var2.c | 2 +- frame/2/ger/bli_ger_unb_var1.c | 2 +- frame/2/ger/bli_ger_unb_var2.c | 2 +- 
frame/2/hemv/bli_hemv_unb_var1.c | 4 +- frame/2/hemv/bli_hemv_unb_var2.c | 2 +- frame/2/hemv/bli_hemv_unb_var3.c | 4 +- frame/2/hemv/bli_hemv_unb_var4.c | 2 +- frame/2/hemv/bli_hemv_unf_var1.c | 2 +- frame/2/hemv/bli_hemv_unf_var1a.c | 2 +- frame/2/hemv/bli_hemv_unf_var3.c | 2 +- frame/2/hemv/bli_hemv_unf_var3a.c | 2 +- frame/2/her/bli_her_unb_var1.c | 2 +- frame/2/her/bli_her_unb_var2.c | 2 +- frame/2/her2/bli_her2_unb_var1.c | 2 +- frame/2/her2/bli_her2_unb_var2.c | 2 +- frame/2/her2/bli_her2_unb_var3.c | 2 +- frame/2/her2/bli_her2_unb_var4.c | 2 +- frame/2/her2/bli_her2_unf_var1.c | 2 +- frame/2/her2/bli_her2_unf_var4.c | 2 +- frame/2/trmv/bli_trmv_unb_var1.c | 2 +- frame/2/trmv/bli_trmv_unb_var2.c | 2 +- frame/2/trmv/bli_trmv_unf_var1.c | 2 +- frame/2/trmv/bli_trmv_unf_var2.c | 2 +- frame/2/trsv/bli_trsv_unb_var1.c | 2 +- frame/2/trsv/bli_trsv_unb_var2.c | 2 +- frame/2/trsv/bli_trsv_unf_var1.c | 2 +- frame/2/trsv/bli_trsv_unf_var2.c | 2 +- frame/3/bli_l3_schema.c | 2 +- frame/3/bli_l3_sup.c | 2 +- frame/3/bli_l3_sup_int.c | 4 +- frame/3/bli_l3_sup_packm_var.c | 11 +- frame/3/bli_l3_sup_vars.h | 2 +- frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 2 +- frame/3/gemm/bli_gemm_md.c | 22 +- frame/3/gemm/bli_gemm_md_c2r_ref.c | 4 +- frame/3/gemm/other/bli_gemm_ker_var2.c | 2 +- frame/3/gemm/other/bli_gemm_ker_var2rr.c | 2 +- frame/3/gemm/other/bli_gemm_ker_var2sl.c | 2 +- frame/3/gemmt/bli_gemmt_front.c | 2 +- frame/3/gemmt/bli_gemmt_l_ker_var2.c | 2 +- frame/3/gemmt/bli_gemmt_u_ker_var2.c | 2 +- frame/3/gemmt/other/bli_gemmt_l_ker_var2.c | 2 +- frame/3/gemmt/other/bli_gemmt_u_ker_var2.c | 2 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/other/bli_trmm_ll_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c | 4 +- frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c 
| 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c | 2 +- frame/3/trmm3/bli_trmm3_front.c | 2 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c | 2 +- frame/3/trsm/other/bli_trsm_rl_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ru_ker_var2.c | 2 +- frame/base/bli_cntx.c | 1586 ++---------- frame/base/bli_cntx.h | 558 +---- frame/base/bli_gks.c | 42 +- frame/base/bli_gks.h | 8 +- frame/include/bli_gentfunc_macro_defs.h | 7 + frame/include/bli_kernel_macro_defs.h | 104 + frame/include/bli_misc_macro_defs.h | 6 + frame/include/bli_param_macro_defs.h | 26 +- frame/include/bli_scalar_macro_defs.h | 7 +- frame/include/bli_type_defs.h | 227 +- frame/include/level0/bli_set0s_edge.h | 79 + kernels/penryn/1/bli_axpyv_penryn_int.c | 2 +- kernels/penryn/1/bli_dotv_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpy2v_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpyf_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotaxpyv_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c | 4 +- kernels/penryn/1f/bli_dotxf_penryn_int.c | 4 +- kernels/zen/1/bli_scalv_zen_int.c | 4 +- kernels/zen/1/bli_scalv_zen_int10.c | 14 +- kernels/zen/1f/bli_axpyf_zen_int_4.c | 4 +- kernels/zen/1f/bli_axpyf_zen_int_5.c | 23 +- kernels/zen/1f/bli_axpyf_zen_int_8.c | 4 +- 
kernels/zen/1f/bli_dotxf_zen_int_8.c | 12 +- ref_kernels/1/bli_axpbyv_ref.c | 14 +- ref_kernels/1/bli_axpyv_ref.c | 4 +- ref_kernels/1/bli_scal2v_ref.c | 4 +- ref_kernels/1/bli_scalv_ref.c | 2 +- ref_kernels/1/bli_xpbyv_ref.c | 4 +- ref_kernels/1f/bli_axpy2v_ref.c | 2 +- ref_kernels/1f/bli_axpyf_ref.c | 2 +- ref_kernels/1f/bli_dotaxpyv_ref.c | 4 +- ref_kernels/1f/bli_dotxaxpyf_ref.c | 4 +- ref_kernels/1f/bli_dotxf_ref.c | 2 +- ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c | 4 +- ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c | 336 +++ ref_kernels/1m/bli_packm_cxc_diag_ref.c | 173 ++ ref_kernels/1m/bli_packm_cxk_1er_ref.c | 2195 +---------------- ref_kernels/1m/bli_packm_cxk_bb_ref.c | 656 ----- ref_kernels/1m/bli_packm_cxk_ref.c | 1679 +------------ ref_kernels/1m/bli_unpackm_cxk_ref.c | 814 +----- ref_kernels/3/bb/bli_gemmbb_ref.c | 141 -- ref_kernels/3/bb/bli_gemmtrsmbb_ref.c | 140 -- ref_kernels/3/bb/bli_trsmbb_ref.c | 214 -- ref_kernels/3/bli_gemm_ref.c | 256 +- ref_kernels/3/bli_gemmtrsm_ref.c | 49 +- ref_kernels/3/bli_trsm_ref.c | 43 +- ref_kernels/bli_cntx_ref.c | 433 ++-- ref_kernels/ind/bli_gemm1m_ref.c | 4 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 56 +- ref_kernels/ind/bli_trsm1m_ref.c | 116 +- sandbox/gemmlike/attic/bls_gemm_bp_var2.c | 8 +- sandbox/gemmlike/bls_gemm.c | 2 +- sandbox/gemmlike/bls_gemm_bp_var1.c | 2 +- sandbox/gemmlike/bls_packm_cxk.c | 7 +- testsuite/src/test_trsm_ukr.c | 4 +- 214 files changed, 5180 insertions(+), 10257 deletions(-) create mode 100644 config/a64fx/bli_kernel_defs_a64fx.h create mode 100644 config/armsve/bli_kernel_defs_armsve.h create mode 100644 config/bgq/bli_kernel_defs_bgq.h create mode 100644 config/bulldozer/bli_kernel_defs_bulldozer.h create mode 100644 config/cortexa15/bli_kernel_defs_cortexa15.h create mode 100644 config/cortexa53/bli_kernel_defs_cortexa53.h create mode 100644 config/cortexa57/bli_kernel_defs_cortexa57.h create mode 100644 config/cortexa9/bli_kernel_defs_cortexa9.h create mode 100644 
config/excavator/bli_kernel_defs_excavator.h create mode 100644 config/firestorm/bli_kernel_defs_firestorm.h create mode 100644 config/generic/bli_kernel_defs_generic.h create mode 100644 config/haswell/bli_kernel_defs_haswell.h create mode 100644 config/knc/bli_kernel_defs_knc.h create mode 100644 config/knl/bli_kernel_defs_knl.h create mode 100644 config/penryn/bli_kernel_defs_penryn.h create mode 100644 config/piledriver/bli_kernel_defs_piledriver.h create mode 100644 config/power10/bli_kernel_defs_power10.h create mode 100644 config/power7/bli_kernel_defs_power7.h create mode 100644 config/power9/bli_kernel_defs_power9.h create mode 100644 config/sandybridge/bli_kernel_defs_sandybridge.h create mode 100644 config/skx/bli_kernel_defs_skx.h create mode 100644 config/steamroller/bli_kernel_defs_steamroller.h create mode 100644 config/template/bli_kernel_defs_template.h create mode 100644 config/thunderx2/bli_kernel_defs_thunderx2.h create mode 100644 config/zen/bli_kernel_defs_zen.h create mode 100644 config/zen2/bli_kernel_defs_zen2.h create mode 100644 config/zen3/bli_kernel_defs_zen3.h rename frame/1m/{packm => other}/bli_packm_cxk.c (84%) rename frame/1m/{packm => other}/bli_packm_cxk.h (100%) rename frame/1m/{packm => other}/bli_packm_cxk_1er.c (94%) rename frame/1m/{packm => other}/bli_packm_cxk_1er.h (100%) rename frame/1m/{packm => other}/bli_packm_struc_cxk_1er.c (100%) rename frame/1m/{packm => other}/bli_packm_struc_cxk_1er.h (100%) rename frame/1m/{unpackm => other}/bli_unpackm_cxk.c (92%) rename frame/1m/{unpackm => other}/bli_unpackm_cxk.h (98%) create mode 100644 frame/include/level0/bli_set0s_edge.h create mode 100644 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c create mode 100644 ref_kernels/1m/bli_packm_cxc_diag_ref.c delete mode 100644 ref_kernels/1m/bli_packm_cxk_bb_ref.c delete mode 100644 ref_kernels/3/bb/bli_gemmbb_ref.c delete mode 100644 ref_kernels/3/bb/bli_gemmtrsmbb_ref.c delete mode 100644 ref_kernels/3/bb/bli_trsmbb_ref.c diff --git 
a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c index a0040fec0..9139e89b1 100644 --- a/addon/gemmd/attic/bao_gemmd_bp_var2.c +++ b/addon/gemmd/attic/bao_gemmd_bp_var2.c @@ -164,7 +164,7 @@ void PASTECH2(bao_,ch,varname) \ function pointer type. */ \ /* PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ */ \ \ /* Temporary C buffer for edge cases. Note that the strides of this @@ -175,7 +175,7 @@ void PASTECH2(bao_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ @@ -536,7 +536,7 @@ void PASTECH2(bao_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -545,7 +545,7 @@ void PASTECH2(bao_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c index fadc52691..01185a9d7 100644 --- a/addon/gemmd/bao_gemmd.c +++ b/addon/gemmd/bao_gemmd.c @@ -137,7 +137,7 @@ void bao_gemmd_ex // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index 09e4df09e..689471367 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -163,7 +163,7 @@ void PASTECH2(bao_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c index 645f09d79..8680c5332 100644 --- a/addon/gemmd/bao_packm_cxk.c +++ b/addon/gemmd/bao_packm_cxk.c @@ -55,15 +55,15 @@ void PASTECH2(bao_,ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. 
*/ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/common.mk b/common.mk index 5f2d30c9b..a93f8ab24 100644 --- a/common.mk +++ b/common.mk @@ -120,6 +120,8 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ @@ -129,6 +131,8 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_REF_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index 5132b2824..dd920bcec 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -38,34 +38,42 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_a64fx_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + // packm + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, + + BLIS_VA_END ); - // Set SVE-512 packing routine. - bli_cntx_set_packm_kers + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs ( - 2, - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - // 12xk is not used and disabled for GCC 8-9 compatibility. - // BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -80,66 +88,18 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - -#if 0 - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + BLIS_VA_END ); - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif - // Set A64FX cache sector sizes for each PE/CMG // SC Fugaku might disable users' setting cache sizes. 
#if !defined(CACHE_SECTOR_SIZE_READONLY) diff --git a/config/a64fx/bli_kernel_defs_a64fx.h b/config/a64fx/bli_kernel_defs_a64fx.h new file mode 100644 index 000000000..2c5c97204 --- /dev/null +++ b/config/a64fx/bli_kernel_defs_a64fx.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 +#define BLIS_MR_c 16 +#define BLIS_MR_z 8 + +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 + +//#endif + diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index ad0e68219..6339ba381 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -45,9 +45,6 @@ void bli_cntx_init_armsve( cntx_t* cntx ) return; blksz_t blkszs[ BLIS_NUM_BLKSZS ]; -#if 0 - blksz_t thresh[ BLIS_NUM_THRESH ]; -#endif // Set default kernel blocksizes and functions. bli_cntx_init_armsve_ref( cntx ); @@ -64,35 +61,55 @@ void bli_cntx_init_armsve( cntx_t* cntx ) bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c); bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z); - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, + cntx, + + // level-3 // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Set VL-specific packing routines if applicable. - if (m_r_d==16) - bli_cntx_set_packm_kers + if ( m_r_d == 16 ) + { + bli_cntx_set_ukrs ( - 2, - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + cntx, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, + BLIS_VA_END ); - else if (m_r_d==8) - bli_cntx_set_packm_kers + } + else if ( m_r_d == 8 ) + { + bli_cntx_set_ukrs ( - 1, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, - cntx + cntx, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, + BLIS_VA_END ); + } // Initialize level-3 blocksize objects with architecture-specific values. // s d c z @@ -106,64 +123,16 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx + BLIS_VA_END ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif } diff --git a/config/armsve/bli_kernel_defs_armsve.h b/config/armsve/bli_kernel_defs_armsve.h new file mode 100644 index 000000000..8c9c0b0dd --- /dev/null +++ b/config/armsve/bli_kernel_defs_armsve.h @@ -0,0 +1,58 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// +// The armsve configuration handles both 256-bit and 512-bit SVE vectors, +// so it is not possible to define specific register block sizes. Thus, +// armsve can't use reference kernels! +// + +#define BLIS_MR_s -1 +#define BLIS_MR_d -1 +#define BLIS_MR_c -1 +#define BLIS_MR_z -1 + +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 + +//#endif + diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c index 782c441b9..d3871d8f7 100644 --- a/config/bgq/bli_cntx_init_bgq.c +++ b/config/bgq/bli_cntx_init_bgq.c @@ -43,14 +43,28 @@ void bli_cntx_init_bgq( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. 
- bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_bgq( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/bgq/bli_kernel_defs_bgq.h b/config/bgq/bli_kernel_defs_bgq.h new file mode 100644 index 000000000..bd3962e45 --- /dev/null +++ b/config/bgq/bli_kernel_defs_bgq.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_d 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/bulldozer/bli_cntx_init_bulldozer.c b/config/bulldozer/bli_cntx_init_bulldozer.c index 9f6e83d6b..5b056f591 100644 --- a/config/bulldozer/bli_cntx_init_bulldozer.c +++ b/config/bulldozer/bli_cntx_init_bulldozer.c @@ -43,16 +43,32 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/bulldozer/bli_kernel_defs_bulldozer.h b/config/bulldozer/bli_kernel_defs_bulldozer.h new file mode 100644 index 000000000..ea1e58e66 --- /dev/null +++ b/config/bulldozer/bli_kernel_defs_bulldozer.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_s 8 +#define BLIS_NR_d 6 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/cortexa15/bli_cntx_init_cortexa15.c b/config/cortexa15/bli_cntx_init_cortexa15.c index 7c6134ff0..28ebdef71 100644 --- a/config/cortexa15/bli_cntx_init_cortexa15.c +++ b/config/cortexa15/bli_cntx_init_cortexa15.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -73,13 +87,16 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa15/bli_kernel_defs_cortexa15.h b/config/cortexa15/bli_kernel_defs_cortexa15.h new file mode 100644 index 000000000..9c413f7f8 --- /dev/null +++ b/config/cortexa15/bli_kernel_defs_cortexa15.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/cortexa53/bli_cntx_init_cortexa53.c b/config/cortexa53/bli_cntx_init_cortexa53.c index d7d786f8c..4957de04e 100644 --- a/config/cortexa53/bli_cntx_init_cortexa53.c +++ b/config/cortexa53/bli_cntx_init_cortexa53.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa53/bli_kernel_defs_cortexa53.h b/config/cortexa53/bli_kernel_defs_cortexa53.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/cortexa53/bli_kernel_defs_cortexa53.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c index 57d18792d..28558bc52 100644 --- a/config/cortexa57/bli_cntx_init_cortexa57.c +++ b/config/cortexa57/bli_cntx_init_cortexa57.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa57/bli_kernel_defs_cortexa57.h b/config/cortexa57/bli_kernel_defs_cortexa57.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/cortexa57/bli_kernel_defs_cortexa57.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c index d38e12ebb..6af3ff91c 100644 --- a/config/cortexa9/bli_cntx_init_cortexa9.c +++ b/config/cortexa9/bli_cntx_init_cortexa9.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -65,13 +79,16 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/cortexa9/bli_kernel_defs_cortexa9.h b/config/cortexa9/bli_kernel_defs_cortexa9.h new file mode 100644 index 000000000..9c413f7f8 --- /dev/null +++ b/config/cortexa9/bli_kernel_defs_cortexa9.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c index adae152d5..d36865b21 100644 --- a/config/excavator/bli_cntx_init_excavator.c +++ b/config/excavator/bli_cntx_init_excavator.c @@ -43,16 +43,32 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/excavator/bli_kernel_defs_excavator.h b/config/excavator/bli_kernel_defs_excavator.h new file mode 100644 index 000000000..df4a8c411 --- /dev/null +++ b/config/excavator/bli_kernel_defs_excavator.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index a15ce0344..8e4d0088d 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -37,32 +37,60 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_firestorm_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + + BLIS_VA_END ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs ( - 4, - BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, - BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -73,72 +101,47 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], -1, 99, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 99, -1, -1 ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 8, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. 
- // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 240, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1024, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 3072, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); } diff --git a/config/firestorm/bli_kernel_defs_firestorm.h b/config/firestorm/bli_kernel_defs_firestorm.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/firestorm/bli_kernel_defs_firestorm.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/generic/bli_kernel_defs_generic.h b/config/generic/bli_kernel_defs_generic.h new file mode 100644 index 000000000..db2f32947 --- /dev/null +++ b/config/generic/bli_kernel_defs_generic.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +//#endif + diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index f2dc900ea..fe3b45147 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -35,79 +35,58 @@ #include "blis.h" -//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) - void bli_cntx_init_haswell( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_haswell_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, + // gemm #if 1 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, #else - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, #endif // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, #if 1 - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 10, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -137,7 +116,74 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm +#if 1 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#else + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, +#endif + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -161,97 +207,54 @@ void bli_cntx_init_haswell( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 201, 201, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // gemmsup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 ); - - // Initialize the context with the sup thresholds. 
- bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, -#if 0 - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); -#endif - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. 
- // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); } diff --git a/config/haswell/bli_kernel_defs_haswell.h b/config/haswell/bli_kernel_defs_haswell.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/haswell/bli_kernel_defs_haswell.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c index 198f08827..8f615588c 100644 --- a/config/knc/bli_cntx_init_knc.c +++ b/config/knc/bli_cntx_init_knc.c @@ -43,13 +43,26 @@ void bli_cntx_init_knc( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -58,7 +71,7 @@ void bli_cntx_init_knc( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0, 0, 160, 0, 0 ); - bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, + bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, 0, 300, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 ); @@ -66,13 +79,16 @@ void bli_cntx_init_knc( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/knc/bli_kernel_defs_knc.h b/config/knc/bli_kernel_defs_knc.h new file mode 100644 index 000000000..0ae6d1b75 --- /dev/null +++ b/config/knc/bli_kernel_defs_knc.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 30 + +#define BLIS_NR_d 8 + +#define BLIS_PACKMR_d 32 + +//#endif + diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index 6da3b7a3a..87fa3176a 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -43,47 +43,33 @@ void bli_cntx_init_knl( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE, - cntx - ); + cntx, - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, - BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, - cntx - ); + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, + + // packm + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -92,12 +78,15 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -106,7 +95,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -125,17 +127,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + BLIS_VA_END ); } diff --git a/config/knl/bli_kernel_defs_knl.h b/config/knl/bli_kernel_defs_knl.h new file mode 100644 index 000000000..ce514bb21 --- /dev/null +++ b/config/knl/bli_kernel_defs_knl.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 24 +#define BLIS_MR_d 24 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/old/armv7a/bli_cntx_init_armv7a.c b/config/old/armv7a/bli_cntx_init_armv7a.c index d4cc9e91d..acd8e6c18 100644 --- a/config/old/armv7a/bli_cntx_init_armv7a.c +++ b/config/old/armv7a/bli_cntx_init_armv7a.c @@ -66,7 +66,7 @@ void bli_cntx_init_armv7a( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, diff --git a/config/old/haswellbb/bli_cntx_init_haswell.c b/config/old/haswellbb/bli_cntx_init_haswell.c index 9e1d03503..88bd14a07 100644 --- a/config/old/haswellbb/bli_cntx_init_haswell.c +++ b/config/old/haswellbb/bli_cntx_init_haswell.c @@ -203,7 +203,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c index 1576bf944..964438e83 100644 --- a/config/penryn/bli_cntx_init_penryn.c +++ b/config/penryn/bli_cntx_init_penryn.c @@ -43,18 +43,36 @@ void bli_cntx_init_penryn( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, FALSE, - //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE, - //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, FALSE, - cntx + cntx, + + //level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, + //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, + //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + //level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -69,13 +87,16 @@ void bli_cntx_init_penryn( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/penryn/bli_kernel_defs_penryn.h b/config/penryn/bli_kernel_defs_penryn.h new file mode 100644 index 000000000..f1e483646 --- /dev/null +++ b/config/penryn/bli_kernel_defs_penryn.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 + +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/piledriver/bli_cntx_init_piledriver.c b/config/piledriver/bli_cntx_init_piledriver.c index 4ed15e322..1c9a96fd9 100644 --- a/config/piledriver/bli_cntx_init_piledriver.c +++ b/config/piledriver/bli_cntx_init_piledriver.c @@ -43,16 +43,32 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/piledriver/bli_kernel_defs_piledriver.h b/config/piledriver/bli_kernel_defs_piledriver.h new file mode 100644 index 000000000..df4a8c411 --- /dev/null +++ b/config/piledriver/bli_kernel_defs_piledriver.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 14c940f99..12d9f51c6 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref ) - -// Instantiate prototypes for level-3 kernels. -GEMM_UKR_PROT( float, s, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref ) - void bli_cntx_init_power10( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -72,51 +43,38 @@ void 
bli_cntx_init_power10( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, TRUE, - - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, FALSE, - - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, TRUE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, FALSE, - cntx - ); + cntx, - // Update the context with customized virtual [gemm]trsm micro-kernels. 
- bli_cntx_set_l3_vir_ukrs - ( - 8, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref, - cntx + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, + + BLIS_VA_END ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs ( - 2, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // s d c z @@ -131,14 +89,16 @@ void bli_cntx_init_power10( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h new file mode 100644 index 000000000..4e32f1173 --- /dev/null +++ b/config/power10/bli_kernel_defs_power10.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 + +#define BLIS_BBN_s 4 +#define BLIS_BBN_d 2 + +//#endif + diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c index c9caf62a6..d5ffe7dcf 100644 --- a/config/power7/bli_cntx_init_power7.c +++ b/config/power7/bli_cntx_init_power7.c @@ -43,13 +43,26 @@ void bli_cntx_init_power7( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -64,13 +77,16 @@ void bli_cntx_init_power7( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/power7/bli_kernel_defs_power7.h b/config/power7/bli_kernel_defs_power7.h new file mode 100644 index 000000000..ceec01df3 --- /dev/null +++ b/config/power7/bli_kernel_defs_power7.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 8 + +#define BLIS_NR_d 4 + +//#endif + diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 4370ce26c..9f2d67632 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref ) - -// Instantiate prototypes for level-3 kernels. 
-GEMM_UKR_PROT( float, s, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref ) - void bli_cntx_init_power9( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -72,50 +43,37 @@ void bli_cntx_init_power9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE, - - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE, - cntx - ); + cntx, - // Update the context with customized virtual [gemm]trsm micro-kernels. - bli_cntx_set_l3_vir_ukrs - ( - 8, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref, - cntx + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, + + BLIS_VA_END ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs ( - 2, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); @@ -131,14 +89,15 @@ void bli_cntx_init_power9( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); + BLIS_VA_END + ); } diff --git a/config/power9/bli_kernel_defs_power9.h b/config/power9/bli_kernel_defs_power9.h new file mode 100644 index 000000000..debfeac5f --- /dev/null +++ b/config/power9/bli_kernel_defs_power9.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_d 12 + +#define BLIS_NR_d 6 + +#define BLIS_BBN_s 4 +#define BLIS_BBN_d 2 + +//#endif + diff --git a/config/sandybridge/bli_cntx_init_sandybridge.c b/config/sandybridge/bli_cntx_init_sandybridge.c index 1ffa5bf8b..0697a3351 100644 --- a/config/sandybridge/bli_cntx_init_sandybridge.c +++ b/config/sandybridge/bli_cntx_init_sandybridge.c @@ -43,16 +43,32 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. 
- bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/sandybridge/bli_kernel_defs_sandybridge.h b/config/sandybridge/bli_kernel_defs_sandybridge.h new file mode 100644 index 000000000..dc1b843f6 --- /dev/null +++ b/config/sandybridge/bli_kernel_defs_sandybridge.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_s 8 +#define BLIS_NR_d 4 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index f18503a7a..3af58b38d 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -43,39 +43,29 @@ void bli_cntx_init_skx( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE, - cntx - ); + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -84,12 +74,15 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -98,7 +91,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -116,17 +122,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + BLIS_VA_END ); } diff --git a/config/skx/bli_kernel_defs_skx.h b/config/skx/bli_kernel_defs_skx.h new file mode 100644 index 000000000..2aaf477ad --- /dev/null +++ b/config/skx/bli_kernel_defs_skx.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 14 + +//#endif + diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c index 13e7f6495..4b4ecdf4e 100644 --- a/config/steamroller/bli_cntx_init_steamroller.c +++ b/config/steamroller/bli_cntx_init_steamroller.c @@ -43,16 +43,32 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/steamroller/bli_kernel_defs_steamroller.h b/config/steamroller/bli_kernel_defs_steamroller.h new file mode 100644 index 000000000..df4a8c411 --- /dev/null +++ b/config/steamroller/bli_kernel_defs_steamroller.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 + +//#endif + diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c index f2b1c8d17..4bacc5d63 100644 --- a/config/template/bli_cntx_init_template.c +++ b/config/template/bli_cntx_init_template.c @@ -45,34 +45,44 @@ void bli_cntx_init_template( cntx_t* cntx ) // Update the context with optimized native gemm micro-kernels and // their storage preferences. - bli_cntx_set_l3_nat_ukrs + bli_cntx_set_ukrs ( - 5, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, FALSE, - cntx - ); + cntx, - // Update the context with optimized level-1f kernels. 
- bli_cntx_set_l1f_kers - ( + // level-3 + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, + BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, + BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, + + // level-1f BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt, BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt, BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt, BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt, - cntx - ); - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( + // level-1v BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt, BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt, - cntx + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -87,13 +97,16 @@ void bli_cntx_init_template( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/template/bli_kernel_defs_template.h b/config/template/bli_kernel_defs_template.h new file mode 100644 index 000000000..86a33d8d8 --- /dev/null +++ b/config/template/bli_kernel_defs_template.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// +// Only defined for block sizes which are not taken as the default (i.e. when +// an optimized kernel is provided). +// + +#define BLIS_MR_z 4 + +#define BLIS_NR_z 4 + +// +// PACKMR/PACKNR do not need to be defined unless they are different from the +// "normal" MR/NR. +// + +//#define BLIS_PACKMR_z 4 + +//#define BLIS_PACKNR_z 4 + +//#endif + diff --git a/config/thunderx2/bli_cntx_init_thunderx2.c b/config/thunderx2/bli_cntx_init_thunderx2.c index f2b7b633d..9d1af2c99 100644 --- a/config/thunderx2/bli_cntx_init_thunderx2.c +++ b/config/thunderx2/bli_cntx_init_thunderx2.c @@ -43,14 +43,28 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + BLIS_VA_END ); } diff --git a/config/thunderx2/bli_kernel_defs_thunderx2.h b/config/thunderx2/bli_kernel_defs_thunderx2.h new file mode 100644 index 000000000..60292099c --- /dev/null +++ b/config/thunderx2/bli_kernel_defs_thunderx2.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 + +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 + +//#endif + diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 1b16cd06f..a10986b23 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -40,92 +40,107 @@ void bli_cntx_init_zen( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, 
bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, +#endif -#if 1 - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 16, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, // axpyv -#if 0 - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, -#else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, -#endif -#if 1 // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, -#endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, @@ -136,25 +151,76 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv -#if 0 - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, -#else BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, -#endif -#if 1 // setv - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, // swapv BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, #endif - - cntx + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -195,131 +261,74 @@ void bli_cntx_init_zen( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 440, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. 
+ // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, -1, -1 ); +#if 0 + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); +#endif + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 ); + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + BLIS_VA_END ); + // ------------------------------------------------------------------------- + +#if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 1, + cntx, + BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, - cntx - ); - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, -#endif - -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. 
- BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx + BLIS_VA_END ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 ); -#if 0 - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); #endif - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); } - diff --git a/config/zen/bli_kernel_defs_zen.h b/config/zen/bli_kernel_defs_zen.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/zen/bli_kernel_defs_zen.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index ba728602b..c7e40b4d0 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -38,73 +38,94 @@ void bli_cntx_init_zen2( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen2_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + + // level-3 sup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, 
bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, +#endif -#if 1 - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, + // packm + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 16, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -127,18 +148,59 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, //copy BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // level-3 sup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, 
BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -158,130 +220,73 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z +#if 1 + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 500, 249, -1, -1 ); +#else + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 100000, 100000, -1, -1 ); +#endif + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z -#if 1 - bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 ); -#else - bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 ); -#endif + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + BLIS_VA_END ); + // ------------------------------------------------------------------------- + #if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 1, + cntx, + BLIS_GEMM, bli_gemmsup_ref, - cntx - ); -#endif + //BLIS_GEMMT, bli_gemmtsup_ref, - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, -#endif - -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. 
- BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + BLIS_VA_END ); +#endif } diff --git a/config/zen2/bli_kernel_defs_zen2.h b/config/zen2/bli_kernel_defs_zen2.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/zen2/bli_kernel_defs_zen2.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 0336ddc95..3ee385ed6 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -37,83 +37,106 @@ void bli_cntx_init_zen3( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen3_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + // gemmsup #if 0 - // AMD: This will be enabled in other PRs. - // packm kernels - bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, - cntx - ); + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, #else - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, + // packm +#if 0 + // AMD: This will be enabled in other PRs. 
+ BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, +#else + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, +#endif // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - - cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 16, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -135,19 +158,75 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, - //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - //copy + // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, - //set + // setv BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + BLIS_VA_END + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, +#if 0 + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. 
+ BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#endif + + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -164,138 +243,67 @@ void bli_cntx_init_zen3( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 200, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 240, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); -// ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 ); + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + BLIS_VA_END ); + // ------------------------------------------------------------------------- + #if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 2, + cntx, + BLIS_GEMM, bli_gemmsup_ref, - BLIS_GEMMT, bli_gemmtsup_ref, - cntx - ); -#endif + //BLIS_GEMMT, bli_gemmtsup_ref, -#if 0 - // AMD: This should be enabled in the PR which has added these kernels - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 28, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx + BLIS_VA_END ); -#else - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - cntx - ); - #endif - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); } diff --git a/config/zen3/bli_kernel_defs_zen3.h b/config/zen3/bli_kernel_defs_zen3.h new file mode 100644 index 000000000..c5bc8d63f --- /dev/null +++ b/config/zen3/bli_kernel_defs_zen3.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md index dcec7754c..cc1224182 100644 --- a/docs/ConfigurationHowTo.md +++ b/docs/ConfigurationHowTo.md @@ -47,7 +47,7 @@ $ ls config/haswell bli_cntx_init_haswell.c bli_family_haswell.h make_defs.mk ``` A sub-configuration (`haswell`, in this case) usually contains just three files: - * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. 
The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute. + * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute. * `bli_family_haswell.h`. This header file is `#included` when the configuration in question, in this case `haswell`, was the target to `./configure`. This is where you would specify certain global parameters and settings. For example, if you wanted to specify custom implementations of `malloc()` and `free()`, this is where you would specify them. The file is oftentimes empty. (In the case of configuration families, the definitions in this file apply to the _entire_ build, and not any specific sub-configuration, but for consistency we support them for all configuration targets, whether they be singleton sub-configurations or configuration families.) * `make_defs.mk`. 
This makefile fragment defines the compiler and compiler flags to use during compilation. Specifically, the values defined in this file are used whenever compiling source code specific to the sub-configuration (i.e., reference kernels and optimized kernels). If the sub-configuration is the target of `configure`, then these flags are also used to compile general framework code. @@ -127,7 +127,7 @@ void bli_cntx_init_fooarch( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, @@ -143,7 +143,7 @@ _**Blocksize object array.**_ The `blkszs` array declaration is needed later in _**Reference initialization.**_ The first function call, `bli_cntx_init_fooarch_ref()`, initializes the context `cntx` with function pointers to reference implementations of all of the kernels supported by BLIS (as well as cache and register blocksizes, and other fields). This function is automatically generated by BLIS for every sub-configuration enabled at configure-time. The function prototype is generated by a preprocessor macro in `frame/include/bli_arch_config.h`. -_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision complex level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). 
The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated. +_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision complex level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated. 
_Note:_ Currently, BLIS only allows the kernel developer to signal a preference (row or column) for `gemm` microkernels. The preference of the `gemmtrsm` and `trsm` microkernels can (and must) be set, but are ignored by the framework during execution. @@ -236,7 +236,7 @@ _**Memory alignment.**_ BLIS implements memory alignment internally, rather than ``` The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_MAX_SIZE`. -The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. +The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. The value `BLIS_HEAP_STRIDE_ALIGN_SIZE` defines the alignment used for so-called "leading dimensions" (i.e. column strides for column-stored matrices, and row strides for row-stored matrices) when creating BLIS matrices via the object-based API (e.g. `bli_obj_create()`). While setting `BLIS_HEAP_ADDR_ALIGN_SIZE` guarantees alignment for the first column (or row), creating a matrix with certain dimension values (_m_ and _n_) may cause subsequent columns (or rows) to be misaligned. Setting this value to `BLIS_SIMD_ALIGN_SIZE` is usually desirable. Additional alignment may or may not be beneficial. 
@@ -246,7 +246,7 @@ The value `BLIS_POOL_ADDR_ALIGN_SIZE_*` define the alignments used when allocati ### make_defs.mk -The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library. +The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library. The format of the file is mostly self-explanatory. However, we will expound on the contents here, using the `make_defs.mk` file for the `haswell` configuration as an example: ```make @@ -304,7 +304,7 @@ _**Debugging flags.**_ The `CDBGFLAGS` variable should be assigned to contain fl _**Optimization flags.**_ The `COPTFLAGS` variable should be assigned any flags relating to general compiler optimization. Usually this takes the form of `-O2` or `-O3`, but more specific optimization flags may be included as well, such as `-fomit-frame-pointer`. Note that, as with `CDBGFLAGS`, `COPTFLAGS` is conditionally assigned based on the value of `$(DEBUG_TYPE)`. A separate `CKOPTFLAGS` variable tracks optimizations flags used when compiling kernels. For most configurations, `CKOPTFLAGS` is assigned as a copy of `COPTFLAGS`, but if the kernel developer needs different optimization flags to be applied when compiling kernel source code, `CKOPTFLAGS` should be set accordingly. -_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`. 
+_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`. _**Variable storage/renaming.**_ Finally, the last statement commits the variables defined in the file to "storage". That is, they are copied to variable names that contain `THIS_CONFIG` as a suffix. This allows the variables for one configuration to co-exist with variables of another configuration. @@ -406,7 +406,7 @@ Some sub-configurations, for various reasons, do not rely on their own set of ke excavator: excavator/piledriver steamroller: steamroller/piledriver ``` -Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner. +Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner. **Note:** Specifying non-native kernel sets via the `/` character is only allowed when defining singleton configuration families. They may NOT appear in the definitions of umbrella families! When an umbrella family includes a singleton family that is defined to require non-native kernels, this will be accounted for during the parsing of the `config_registry` file. 
@@ -467,7 +467,7 @@ configure: skx: skx configure: steamroller: steamroller configure: x86_64: haswell sandybridge penryn zen excavator steamroller piledriver bulldozer generic ``` -This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically. +This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically. Next, the kernel list (actually, all kernel lists) is printed: ``` @@ -549,7 +549,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the 2. _**Add support within the framework source code.**_ We also need to make a minor update to the framework to support the new kernels--specifically, to pull in the kernels' function prototypes. - **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file: + **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file: ```c #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" @@ -560,7 +560,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the ## Adding a new configuration family -Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set. +Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. 
The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set. @@ -636,7 +636,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f ``` THIS_CONFIG := knl ``` - and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. + and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. ```c #define BLIS_SIMD_MAX_NUM_REGISTERS 32 #define BLIS_SIMD_MAX_SIZE 64 @@ -714,7 +714,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f #include "bli_family_knl.h" #endif ``` - As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.) 
+ As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.) diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 5fdfdb91e..1d12b42eb 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -61,15 +61,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - x, incx, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -98,14 +98,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - n, \ - x, incx, \ - index, \ - cntx \ + n, \ + x, incx, \ + index, \ + cntx \ ); \ } @@ -135,17 +135,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - alpha, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + alpha, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } @@ -175,16 +175,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) \ cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -215,17 +215,17 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - x, incx, \ - y, incy, \ - rho, \ - cntx \ + conjx, \ + conjy, \ + n, \ + x, incx, \ + y, incy, \ + rho, \ + cntx \ ); \ } @@ -257,19 +257,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - beta, \ - rho, \ - cntx \ + conjx, \ + conjy, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + beta, \ + rho, \ + cntx \ ); \ } @@ -295,13 +295,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - n, \ - x, incx, \ - cntx \ + n, \ + x, incx, \ + cntx \ ); \ } @@ -329,15 +329,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjalpha, \ - n, \ - alpha, \ - x, incx, \ - cntx \ + conjalpha, \ + n, \ + alpha, \ + x, incx, \ + cntx \ ); \ } @@ -365,14 +365,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - n, \ - x, incx, \ - y, incy, \ - cntx \ + n, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -400,16 +400,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - n, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } diff --git a/frame/1/other/packv/bli_packv_unb_var1.c b/frame/1/other/packv/bli_packv_unb_var1.c index 23b370949..ca1323b58 100644 --- a/frame/1/other/packv/bli_packv_unb_var1.c +++ b/frame/1/other/packv/bli_packv_unb_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/frame/1/other/unpackv/bli_unpackv_unb_var1.c b/frame/1/other/unpackv/bli_unpackv_unb_var1.c index 5dc1101b6..43c9a266c 100644 --- a/frame/1/other/unpackv/bli_unpackv_unb_var1.c +++ b/frame/1/other/unpackv/bli_unpackv_unb_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index a8f9e844a..cfaf5150f 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -85,32 +85,33 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. 
*/ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER ) @@ -164,33 +165,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + alpha, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER ) @@ -233,20 +235,21 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - n_elem, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + n_elem, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER ) @@ -290,22 +293,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjalpha, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + conjalpha, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER ) @@ -361,27 +365,28 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC(ch,setis)( *alpha, *chi11 ); \ } */ \ \ - /* Acquire the addres of the imaginary component of the first element, + /* Acquire the address of the imaginary component of the first element, and scale the increment for use in the real domain. Note that the indexing into the imaginary field only needs to work for complex datatypes since we return early for real domain types. */ \ - x1 = ( ctype_r* )( x + offx ) + 1; \ + x1 = ( ctype_r* )( x + offx ) + 1; \ incx = 2*incx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. 
*/ \ - PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx ); \ + PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) @@ -424,22 +429,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - alpha, 0, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + alpha, 0, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER ) @@ -491,33 +497,34 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. 
*/ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER ) diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index 332ff5af2..a54379299 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -65,19 +65,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - alphax, \ - alphay, \ - x, incx, \ - y, incy, \ - z, incz, \ - cntx \ + conjx, \ + conjy, \ + n, \ + alphax, \ + alphay, \ + x, incx, \ + y, incy, \ + z, incz, \ + cntx \ ); \ } @@ -109,19 +109,19 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conja, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy, \ - cntx \ + conja, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -154,20 +154,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjxt, \ - conjx, \ - conjy, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - rho, \ - z, incz, \ - cntx \ + conjxt, \ + conjx, \ + conjy, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + rho, \ + z, incz, \ + cntx \ ); \ } @@ -204,24 +204,24 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjat, \ - conja, \ - conjw, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - w, incw, \ - x, incx, \ - beta, \ - y, incy, \ - z, incz, \ - cntx \ + conjat, \ + conja, \ + conjw, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + w, incw, \ + x, incx, \ + beta, \ + y, incy, \ + z, incz, \ + cntx \ ); \ } @@ -254,20 +254,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ - conjat, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjat, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 2e813cf4a..41d80e217 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -102,35 +102,40 @@ INSERT_GENTDEF( packm_cxk ) \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ - conj_t conjp, \ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) -// packm_1er_ker +// packm_diag_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ conj_t conja, \ pack_t schema, \ + bool invdiag, \ dim_t cdim, \ - dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); -INSERT_GENTDEF( packm_cxk_1er ) +INSERT_GENTDEF( packm_cxc_diag ) #endif diff --git a/frame/1m/bli_l1m_ker.h b/frame/1m/bli_l1m_ker.h index 76d51af2b..970c5f040 100644 --- a/frame/1m/bli_l1m_ker.h +++ b/frame/1m/bli_l1m_ker.h @@ -47,16 +47,8 @@ #undef GENTPROT #define GENTPROT PACKM_KER_PROT -INSERT_GENTPROT_BASIC0( packm_2xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_3xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_4xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_6xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_8xk_ker_name ) -INSERT_GENTPROT_BASIC0( 
packm_10xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_12xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_14xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_16xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_24xk_ker_name ) +INSERT_GENTPROT_BASIC0( packm_mrxk_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxk_ker_name ) // native unpackm kernels @@ -64,27 +56,33 @@ INSERT_GENTPROT_BASIC0( packm_24xk_ker_name ) #undef GENTPROT #define GENTPROT UNPACKM_KER_PROT -INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name ) +INSERT_GENTPROT_BASIC0( unpackm_mrxk_ker_name ) +INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name ) // 1e/1r packm kernels #undef GENTPROT -#define GENTPROT PACKM_1ER_KER_PROT - -INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name ) +#define GENTPROT PACKM_KER_PROT + +INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name ) + + +// packm kernels for diagonal blocks + +#undef GENTPROT +#define GENTPROT PACKM_DIAG_KER_PROT + +INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_ker_name ) + + +// 1e/1r packm kernels for diagonal blocks + +#undef GENTPROT +#define GENTPROT PACKM_DIAG_KER_PROT + +INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_1er_ker_name ) diff --git 
a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h index 02d329622..80284ea22 100644 --- a/frame/1m/bli_l1m_ker_prot.h +++ b/frame/1m/bli_l1m_ker_prot.h @@ -37,7 +37,7 @@ // Define template prototypes for level-1m kernels. // -// native packm kernels +// packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ @@ -55,35 +55,40 @@ void PASTEMAC(ch,varname) \ ); -// native unpackm kernels +// unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); -// 1e/1r packm kernels +// packm kernels for diagonal blocks -#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ +#define PACKM_DIAG_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ conj_t conja, \ pack_t schema, \ + bool invdiag, \ dim_t cdim, \ - dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index f2ce3c8d7..c979f082a 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -197,7 +197,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. 
*/ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -310,7 +310,7 @@ void PASTEMAC(ch,opname) \ if ( bli_is_zeros( uplox_eff ) ) return; \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -423,7 +423,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ diff --git a/frame/1m/packm/bli_packm_cxk.c b/frame/1m/other/bli_packm_cxk.c similarity index 84% rename from frame/1m/packm/bli_packm_cxk.c rename to frame/1m/other/bli_packm_cxk.c index ea0418cae..612b37f78 100644 --- a/frame/1m/packm/bli_packm_cxk.c +++ b/frame/1m/other/bli_packm_cxk.c @@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. 
*/ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ @@ -91,30 +92,30 @@ void PASTEMAC(ch,opname) \ that happens, the packm kernel must have set the 0's added in step (3) below. - packm kernel packm kernel packm kernel packm_tri_cxk + packm kernel packm kernel packm kernel packm_tri_cxk step 1: step 2: step 3: step 4: - x x x x . . x x x x . . x x x x 0 0 x x x x 0 0 - ? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0 - ? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0 - ? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0 - . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 + x x x x . . x x x x . . x x x x 0 0 x x x x 0 0 + ? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0 + ? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0 + ? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0 + . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 x Copied from A; valid element. - ? Copied from A, but value is unknown and unused. + ? Copied from A, but value is unknown and unused. . Uninitialized. - 0 Initialized to zero. - 1 Initialized to one. + 0 Initialized to zero. + 1 Initialized to one. NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s to zero. This is not needed to support trsm, but rather to support trmm. (Both use the same packing format and code.) 
- In this case, panel_dim will be 4 because four rows of data are - copied from A, panel_len will be 4 because those four rows span - four columns of A, and panel_len_max will be 6 because there are a - total of 6 columns that can be written to in the packed micropanel, + In this case, panel_dim will be 4 because four rows of data are + copied from A, panel_len will be 4 because those four rows span + four columns of A, and panel_len_max will be 6 because there are a + total of 6 columns that can be written to in the packed micropanel, 2 of which lie beyond the values copied from A. */ \ f \ ( \ diff --git a/frame/1m/packm/bli_packm_cxk.h b/frame/1m/other/bli_packm_cxk.h similarity index 100% rename from frame/1m/packm/bli_packm_cxk.h rename to frame/1m/other/bli_packm_cxk.h diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/other/bli_packm_cxk_1er.c similarity index 94% rename from frame/1m/packm/bli_packm_cxk_1er.c rename to frame/1m/other/bli_packm_cxk_1er.c index e583c8a82..22598dbac 100644 --- a/frame/1m/packm/bli_packm_cxk_1er.c +++ b/frame/1m/other/bli_packm_cxk_1er.c @@ -54,15 +54,16 @@ void PASTEMAC(ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \ + : BLIS_PACKM_MRXK_1ER_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. 
*/ \ diff --git a/frame/1m/packm/bli_packm_cxk_1er.h b/frame/1m/other/bli_packm_cxk_1er.h similarity index 100% rename from frame/1m/packm/bli_packm_cxk_1er.h rename to frame/1m/other/bli_packm_cxk_1er.h diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/other/bli_packm_struc_cxk_1er.c similarity index 100% rename from frame/1m/packm/bli_packm_struc_cxk_1er.c rename to frame/1m/other/bli_packm_struc_cxk_1er.c diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/other/bli_packm_struc_cxk_1er.h similarity index 100% rename from frame/1m/packm/bli_packm_struc_cxk_1er.h rename to frame/1m/other/bli_packm_struc_cxk_1er.h diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/other/bli_unpackm_cxk.c similarity index 92% rename from frame/1m/unpackm/bli_unpackm_cxk.c rename to frame/1m/other/bli_unpackm_cxk.c index 4423c41a2..4b7977e86 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/other/bli_unpackm_cxk.c @@ -40,6 +40,7 @@ void PASTEMAC(ch,opname) \ ( \ conj_t conjp, \ + pack_t schema, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ @@ -48,15 +49,16 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \ + : BLIS_UNPACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the unpackm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. 
*/ \ diff --git a/frame/1m/unpackm/bli_unpackm_cxk.h b/frame/1m/other/bli_unpackm_cxk.h similarity index 98% rename from frame/1m/unpackm/bli_unpackm_cxk.h rename to frame/1m/other/bli_unpackm_cxk.h index 53c3c0c44..d828a9b8e 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.h +++ b/frame/1m/other/bli_unpackm_cxk.h @@ -39,6 +39,7 @@ void PASTEMAC(ch,varname) \ ( \ conj_t conjp, \ + pack_t schema, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 88657a712..7d73bf903 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -43,10 +43,6 @@ #include "bli_packm_part.h" #include "bli_packm_struc_cxk.h" -#include "bli_packm_struc_cxk_1er.h" - -#include "bli_packm_cxk.h" -#include "bli_packm_cxk_1er.h" // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index edeeae2b9..e13391151 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -43,11 +43,11 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { { bli_spackm_struc_cxk, bli_cpackm_struc_cxk, bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } }, // 0001 row/col panels: 1m-expanded (1e) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk, + NULL, bli_zpackm_struc_cxk, } }, // 0010 row/col panels: 1m-reordered (1r) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk, + NULL, bli_zpackm_struc_cxk, } }, }; static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index 2a52c42de..dbdaf4738 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -34,8 +34,8 @@ #include "blis.h" -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) 
\ +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, cxk_kername, cxc_kername ) \ \ void PASTEMAC(ch,varname) \ ( \ @@ -58,138 +58,38 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ - /* Handle micro-panel packing based on the structure of the matrix - being packed. */ \ - if ( bli_is_general( strucc ) ) \ - { \ - /* For micro-panels of general matrices, we can call the pack - kernel front-end directly. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ - /* Call a helper function for micro-panels of Hermitian/symmetric - matrices. */ \ - PASTEMAC(ch,packm_herm_cxk) \ - ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ - ); \ - } \ - else /* ( bli_is_triangular( strucc ) ) */ \ - { \ - /* Call a helper function for micro-panels of triangular - matrices. 
*/ \ - PASTEMAC(ch,packm_tri_cxk) \ - ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ + num_t dt = PASTEMAC(ch,type); \ + num_t dt_r = PASTEMAC(chr,type); \ + dim_t panel_len_pad = panel_len_max - panel_len; \ \ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ - ) \ -{ \ - doff_t diagoffc = panel_dim_off - panel_len_off; \ - doff_t diagoffc_abs; \ - dim_t i, j; \ + bszid_t bsz_id = bli_is_col_packed( schema ) ? BLIS_NR : BLIS_MR; \ + dim_t packmrnr = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \ + dim_t packmrnr_r = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \ \ - /* Handle the case where the micro-panel does NOT intersect the - diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ + ukr_t cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ + ukr_t cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER \ + : BLIS_PACKM_MRXMR_DIAG_KER; \ +\ + if ( bli_is_1m_packed( schema ) ) \ { \ - /* If the current panel is unstored, we need to make a few - adjustments so we refer to the data where it is actually - stored, also taking conjugation into account. 
(Note this - implicitly assumes we are operating on a dense panel - within a larger symmetric or Hermitian matrix, since a - general matrix would not contain any unstored region.) */ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ - { \ - c = c + diagoffc * ( doff_t )ldc + \ - -diagoffc * ( doff_t )incc; \ - bli_swap_incs( &incc, &ldc ); \ + cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \ + : BLIS_PACKM_MRXK_1ER_KER; \ + cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER \ + : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \ + } \ \ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc ); \ - } \ + PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \ + PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \ \ - /* Pack the full panel. */ \ - PASTEMAC(ch,kername) \ + /* For general matrices, pack and return early */ \ + if ( bli_is_general( strucc ) ) \ + { \ + f_cxk \ ( \ conjc, \ schema, \ panel_dim, \ - panel_dim_max, \ panel_len, \ panel_len_max, \ kappa, \ @@ -197,321 +97,210 @@ void PASTEMAC(ch,varname) \ p, ldp, \ cntx \ ); \ + return; \ } \ - else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ - { \ - ctype* restrict c10; \ - ctype* restrict p10; \ - dim_t p10_dim, p10_len; \ - inc_t incc10, ldc10; \ - doff_t diagoffc10; \ - conj_t conjc10; \ \ - ctype* restrict c12; \ - ctype* restrict p12; \ - dim_t p12_dim, p12_len; \ - inc_t incc12, ldc12; \ - doff_t diagoffc12; \ - conj_t conjc12; \ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + if ( ( -panel_dim < diagoffc && diagoffc < 0 ) || \ + ( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( diagoffc < 0 ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ + /* For triangular, symmetric, and hermitian matrices we need to consider + three parts. */ \ \ - diagoffc_abs = bli_abs( diagoffc ); \ + /* Pack to p10. */ \ + if ( 0 < diagoffc ) \ + { \ + dim_t p10_dim = panel_dim; \ + dim_t p10_len = bli_min( diagoffc, panel_len ); \ + dim_t p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \ + ctype* p10 = p; \ + conj_t conjc10 = conjc; \ + ctype* c10 = c; \ + inc_t incc10 = incc; \ + inc_t ldc10 = ldc; \ \ - if ( bli_is_lower( uploc ) ) \ + if ( bli_is_upper( uploc ) ) \ { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs; \ - p10 = p; \ - c10 = c; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - diagoffc12 = diagoffc_abs - j; \ - p12 = p + (j )*ldp; \ - c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )ldc + \ - -diagoffc12 * ( doff_t )incc; \ - incc12 = ldc; \ - ldc12 = incc; \ - conjc12 = conjc; \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc12 ); \ - } \ - else /* if ( bli_is_upper( uploc ) ) */ \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs + panel_dim; \ - diagoffc10 = diagoffc; \ - p10 = p; \ - c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )ldc + \ - -diagoffc10 * ( doff_t )incc; \ - incc10 = ldc; \ - ldc10 = incc; \ - conjc10 = conjc; \ -\ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - p12 = p + (j )*ldp; \ - c12 
= c + (j )*ldc; \ - incc12 = incc; \ - ldc12 = ldc; \ - conjc12 = conjc; \ + bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \ \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc10 ); \ } \ \ - /* Pack to p10. For upper storage, this includes the unstored - triangle of c11. */ \ - /* NOTE: Since we're only packing partial panels here, we pass in - p1x_len as panel_len_max; otherwise, the packm kernel will zero- - fill the columns up to panel_len_max, which is not what we need - or want to happen. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc10, \ - schema, \ - p10_dim, \ - panel_dim_max, \ - p10_len, \ - p10_len, \ - kappa, \ - c10, incc10, ldc10, \ - p10, ldp, \ - cntx \ - ); \ -\ - /* Pack to p12. For lower storage, this includes the unstored - triangle of c11. */ \ - /* NOTE: Since we're only packing partial panels here, we pass in - p1x_len as panel_len_max; otherwise, the packm kernel will zero- - fill the columns up to panel_len_max, which is not what we need - or want to happen. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc12, \ - schema, \ - p12_dim, \ - panel_dim_max, \ - p12_len, \ - p12_len, \ - kappa, \ - c12, incc12, ldc12, \ - p12, ldp, \ - cntx \ - ); \ -\ - /* Pack the stored triangle of c11 to p11. */ \ + /* If we are referencing the unstored part of a triangular matrix, + explicitly store zeros */ \ + if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \ { \ - dim_t p11_m = panel_dim; \ - dim_t p11_n = panel_dim; \ - dim_t j2 = diagoffc_abs; \ - ctype* restrict c11 = c + (j2 )*ldc; \ - ctype* restrict p11 = p + (j2 )*ldp; \ - trans_t transc = ( trans_t )conjc; \ -\ - PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - transc, \ - p11_m, \ - p11_n, \ - c11, incc, ldc, \ - p11, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* If source matrix c is Hermitian, we have to zero out the - imaginary components of the diagonal of p11 in case the - corresponding elements in c11 were not already zero. 
*/ \ - if ( bli_is_hermitian( strucc ) ) \ + if ( bli_is_1m_packed( schema ) ) \ { \ - ctype* restrict pi11 = p11; \ -\ - for ( i = 0; i < p11_m; ++i ) \ - { \ - PASTEMAC(ch,seti0s)( *pi11 ); \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ \ - pi11 += 1 + ldp; \ - } \ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p10_len_max * 2, \ + zero, \ + ( ctype_r* )p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ \ - /* Now that the diagonal has been made explicitly Hermitian - (if applicable), we can now safely scale the stored - triangle specified by uploc. */ \ - PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \ + PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p10_len_max, \ + zero, \ + p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - p11_m, \ - p11_n, \ + conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + p10_len_max, \ kappa, \ - p11, 1, ldp, \ - cntx, \ - NULL \ + c10, incc10, ldc10, \ + p10, ldp, \ + cntx \ ); \ } \ } \ -} - -INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) - - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ \ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ - ) \ -{ \ - doff_t diagoffc = panel_dim_off - panel_len_off; \ -\ - /* Pack the panel. 
*/ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ -\ -\ - /* If the diagonal of c is implicitly unit, explicitly set the - the diagonal of the packed panel to kappa. */ \ - if ( bli_is_unit_diag( diagc ) ) \ + /* Pack to p11. */ \ + if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \ { \ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ + dim_t i = diagoffc; \ + dim_t p11_dim = panel_dim; \ + dim_t p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \ + ? panel_len_pad : 0 ); \ + ctype* p11 = p + i * ldp; \ + conj_t conjc11 = conjc; \ + ctype* c11 = c + i * ldc; \ + inc_t incc11 = incc; \ + inc_t ldc11 = ldc; \ +\ + f_cxc \ ( \ - BLIS_NO_CONJUGATE, \ - diagoffc, \ - panel_dim, \ - panel_len, \ + strucc, \ + diagc, \ + uploc, \ + conjc11, \ + schema, \ + invdiag, \ + p11_dim, \ + p11_len_max, \ kappa, \ - p, 1, ldp, \ - cntx, \ - NULL \ + c11, incc11, ldc11, \ + p11, ldp, \ + cntx \ ); \ } \ \ - /* If requested, invert the diagonal of the packed panel. */ \ - if ( invdiag == TRUE ) \ + /* Pack to p12. */ \ + if ( diagoffc + panel_dim < panel_len ) \ { \ - PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - panel_dim, \ - panel_len, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ + dim_t i = bli_max( 0, diagoffc + panel_dim ); \ + dim_t p12_dim = panel_dim; \ + dim_t p12_len = panel_len - i; \ + /* If we are packing p12, then it is always the last partial block \ + and so we should make sure to pad with zeros if necessary. */ \ + dim_t p12_len_max = p12_len + panel_len_pad; \ + ctype* p12 = p + i * ldp; \ + conj_t conjc12 = conjc; \ + ctype* c12 = c + i * ldc; \ + inc_t incc12 = incc; \ + inc_t ldc12 = ldc; \ \ - /* Set the region opposite the diagonal of p to zero. To do this, - we need to reference the "unstored" region on the other side of - the diagonal. 
This amounts to toggling uploc and then shifting - the diagonal offset to shrink the newly referenced region (by - one diagonal). Note that this zero-filling is not needed for - trsm, since the unstored region is not referenced by the trsm - micro-kernel; however, zero-filling is needed for trmm, which - uses the gemm micro-kernel.*/ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - uplo_t uplop = uploc; \ + if ( bli_is_lower( uploc ) ) \ + { \ + bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \ \ - bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( &conjc12 ); \ + } \ \ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffc, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - panel_dim, \ - panel_len, \ - zero, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ + /* If we are referencing the unstored part of a triangular matrix, + explicitly store zeros */ \ + if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \ + { \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( panel_dim != panel_dim_max && \ - panel_len != panel_len_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t i = panel_dim; \ - dim_t j = panel_len; \ - dim_t m_br = panel_dim_max - i; \ - dim_t n_br = panel_len_max - j; \ - ctype* p_br = p + (i ) + (j )*ldp; \ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p12_len_max * 2, \ + zero, \ + ( ctype_r* )p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ \ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one, \ - p_br, 1, ldp, \ - cntx, \ - NULL \ - ); \ + PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p12_len_max, \ + zero, \ + p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ + ( \ + conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + p12_len_max, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp, \ + cntx \ + ); \ + } \ } \ } -INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk ) +INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag ) diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index 5e4542841..80fa3804a 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -37,5 +37,3 @@ #include "bli_unpackm_int.h" #include "bli_unpackm_blk_var1.h" - -#include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index b2c862045..b6165f516 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -36,21 +36,22 @@ #define FUNCPTR_T unpackm_fp -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - dim_t m, - dim_t n, - dim_t m_panel, - dim_t n_panel, - void* p, inc_t rs_p, inc_t 
cs_p, - dim_t pd_p, inc_t ps_p, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ); +typedef void (*FUNCPTR_T) + ( + struc_t strucc, + doff_t diagoffc, + diag_t diagc, + uplo_t uploc, + trans_t transc, + dim_t m, + dim_t n, + dim_t m_panel, + dim_t n_panel, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx + ); static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); @@ -152,10 +153,10 @@ void PASTEMAC(ch,varname) \ dim_t iter_dim; \ dim_t num_iter; \ dim_t it, ic, ip; \ - dim_t ic0, ip0; \ + dim_t ic0, ip0; \ doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ dim_t panel_len; \ dim_t panel_dim_i; \ dim_t panel_dim_max; \ @@ -164,6 +165,7 @@ void PASTEMAC(ch,varname) \ inc_t ldp; \ dim_t* m_panel_full; \ dim_t* n_panel_full; \ + pack_t schema; \ \ \ /* If c needs a transposition, induce it so that we can more simply @@ -182,6 +184,7 @@ void PASTEMAC(ch,varname) \ if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to unpack from column panels. */ \ + schema = BLIS_PACKED_COL_PANELS; \ iter_dim = n; \ panel_len = m; \ panel_dim_max = pd_p; \ @@ -196,6 +199,7 @@ void PASTEMAC(ch,varname) \ else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to unpack from row panels. */ \ + schema = BLIS_PACKED_ROW_PANELS; \ iter_dim = m; \ panel_len = n; \ panel_dim_max = pd_p; \ @@ -207,6 +211,14 @@ void PASTEMAC(ch,varname) \ m_panel_full = &panel_dim_i; \ n_panel_full = &n; \ } \ +\ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \ + : BLIS_UNPACKM_MRXK_KER; \ +\ + /* Query the context for the unpackm kernel corresponding to the current + panel dimension, or kernel id. */ \ + PASTECH2(ch,unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* Compute the total number of iterations we'll need. 
*/ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ @@ -253,9 +265,10 @@ void PASTEMAC(ch,varname) \ else \ { \ /* Pack the current panel. */ \ - PASTEMAC(ch,unpackm_cxk) \ + f \ ( \ BLIS_NO_CONJUGATE, \ + schema, \ panel_dim_i, \ panel_len, \ one, \ diff --git a/frame/2/gemv/bli_gemv_unb_var1.c b/frame/2/gemv/bli_gemv_unb_var1.c index 3f5681d2b..840b96901 100644 --- a/frame/2/gemv/bli_gemv_unb_var1.c +++ b/frame/2/gemv/bli_gemv_unb_var1.c @@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c index 8166aa417..7fc4fcfe4 100644 --- a/frame/2/gemv/bli_gemv_unb_var2.c +++ b/frame/2/gemv/bli_gemv_unb_var2.c @@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index e392e830e..0dceed4cf 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -71,7 +71,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index fe7702e4c..4c43657ad 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ diff --git a/frame/2/ger/bli_ger_unb_var1.c b/frame/2/ger/bli_ger_unb_var1.c index d6cda277e..d8ddd1247 100644 --- a/frame/2/ger/bli_ger_unb_var1.c +++ b/frame/2/ger/bli_ger_unb_var1.c @@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/ger/bli_ger_unb_var2.c b/frame/2/ger/bli_ger_unb_var2.c index 1590bfe5e..9c49e336b 100644 --- a/frame/2/ger/bli_ger_unb_var2.c +++ b/frame/2/ger/bli_ger_unb_var2.c @@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( j = 0; j < n; ++j ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c index ea5d478be..71c27a326 100644 --- a/frame/2/hemv/bli_hemv_unb_var1.c +++ b/frame/2/hemv/bli_hemv_unb_var1.c @@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c index 1f7346517..3753c8d3b 100644 --- a/frame/2/hemv/bli_hemv_unb_var2.c +++ b/frame/2/hemv/bli_hemv_unb_var2.c @@ -123,7 +123,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c index 6573e59fc..d592251d5 100644 --- a/frame/2/hemv/bli_hemv_unb_var3.c +++ b/frame/2/hemv/bli_hemv_unb_var3.c @@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c index deabc3ab4..10cf953b6 100644 --- a/frame/2/hemv/bli_hemv_unb_var4.c +++ b/frame/2/hemv/bli_hemv_unb_var4.c @@ -122,7 +122,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointers. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c index d36dc0098..a449909a5 100644 --- a/frame/2/hemv/bli_hemv_unf_var1.c +++ b/frame/2/hemv/bli_hemv_unf_var1.c @@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c index 31ab1515f..d0af57393 100644 --- a/frame/2/hemv/bli_hemv_unf_var1a.c +++ b/frame/2/hemv/bli_hemv_unf_var1a.c @@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ + kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c index d8db9bc78..baaff098d 100644 --- a/frame/2/hemv/bli_hemv_unf_var3.c +++ b/frame/2/hemv/bli_hemv_unf_var3.c @@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c index 54ab0f6ce..55c1929ff 100644 --- a/frame/2/hemv/bli_hemv_unf_var3a.c +++ b/frame/2/hemv/bli_hemv_unf_var3a.c @@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ + kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her/bli_her_unb_var1.c b/frame/2/her/bli_her_unb_var1.c index e7f718680..8cd6bd397 100644 --- a/frame/2/her/bli_her_unb_var1.c +++ b/frame/2/her/bli_her_unb_var1.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her/bli_her_unb_var2.c b/frame/2/her/bli_her_unb_var2.c index 4b39e1df0..f68798dce 100644 --- a/frame/2/her/bli_her_unb_var2.c +++ b/frame/2/her/bli_her_unb_var2.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c index 37423bfcb..b5c182639 100644 --- a/frame/2/her2/bli_her2_unb_var1.c +++ b/frame/2/her2/bli_her2_unb_var1.c @@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c index 22d6de07a..602e922a8 100644 --- a/frame/2/her2/bli_her2_unb_var2.c +++ b/frame/2/her2/bli_her2_unb_var2.c @@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c index 297b9b702..1d5872d5d 100644 --- a/frame/2/her2/bli_her2_unb_var3.c +++ b/frame/2/her2/bli_her2_unb_var3.c @@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c index 58adb0e70..922fe7db7 100644 --- a/frame/2/her2/bli_her2_unb_var4.c +++ b/frame/2/her2/bli_her2_unb_var4.c @@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c index a0aec48f7..3824880c6 100644 --- a/frame/2/her2/bli_her2_unf_var1.c +++ b/frame/2/her2/bli_her2_unf_var1.c @@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ + kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c index 3dea31d53..6b2b0e9ac 100644 --- a/frame/2/her2/bli_her2_unf_var4.c +++ b/frame/2/her2/bli_her2_unf_var4.c @@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ + kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/trmv/bli_trmv_unb_var1.c b/frame/2/trmv/bli_trmv_unb_var1.c index 31bfa6a83..367a34e6c 100644 --- a/frame/2/trmv/bli_trmv_unb_var1.c +++ b/frame/2/trmv/bli_trmv_unb_var1.c @@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trmv/bli_trmv_unb_var2.c b/frame/2/trmv/bli_trmv_unb_var2.c index 00d4d95f3..fa21776b3 100644 --- a/frame/2/trmv/bli_trmv_unb_var2.c +++ b/frame/2/trmv/bli_trmv_unb_var2.c @@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c index 6dc3cea36..9e576fc77 100644 --- a/frame/2/trmv/bli_trmv_unf_var1.c +++ b/frame/2/trmv/bli_trmv_unf_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c index 8bbd51820..052595935 100644 --- a/frame/2/trmv/bli_trmv_unf_var2.c +++ b/frame/2/trmv/bli_trmv_unf_var2.c @@ -90,7 +90,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c index c7493e33d..2f24b10a8 100644 --- a/frame/2/trsv/bli_trsv_unb_var1.c +++ b/frame/2/trsv/bli_trsv_unb_var1.c @@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotv_ker_ft) kfp_tv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_tv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + kfp_tv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c index a78e7eef0..1a8e81634 100644 --- a/frame/2/trsv/bli_trsv_unb_var2.c +++ b/frame/2/trsv/bli_trsv_unb_var2.c @@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c index 3b03b43e5..824f26d15 100644 --- a/frame/2/trsv/bli_trsv_unf_var1.c +++ b/frame/2/trsv/bli_trsv_unf_var1.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c index 10741d291..bd1f8e3b0 100644 --- a/frame/2/trsv/bli_trsv_unf_var2.c +++ b/frame/2/trsv/bli_trsv_unf_var2.c @@ -102,7 +102,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c index bde30c527..1d4608799 100644 --- a/frame/3/bli_l3_schema.c +++ b/frame/3/bli_l3_schema.c @@ -57,7 +57,7 @@ void bli_l3_set_schemas // projection of dt to query the preference of the corresponding native // real-domain microkernel. This is what ultimately determines which // variant of 1m is applicable. - if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ) ) { schema_a = BLIS_PACKED_ROW_PANELS_1E; schema_b = BLIS_PACKED_COL_PANELS_1R; diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index 72ec405ab..7e37e1f22 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -63,7 +63,7 @@ err_t bli_gemmsup // Return early if a microkernel preference-induced transposition would // have been performed and shifted the dimensions outside of the space // of sup-handled problems. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( c, BLIS_GEMM_VIR_UKR, cntx ) ) { const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index e54e01d7c..3da3954fa 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -85,7 +85,7 @@ err_t bli_gemmsup_int const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx ); const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); @@ -259,7 +259,7 @@ err_t bli_gemmtsup_int const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx ); const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 85fb246f0..519dc5ccd 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -122,6 +122,14 @@ void PASTEMAC(ch,varname) \ ldc = cs_c; \ ldp = cs_p; \ } \ +\ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ +\ + /* Query the context for the unpackm kernel corresponding to the current + panel dimension, or kernel id. */ \ + PASTECH2(ch,packm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* Compute the total number of iterations we'll need. */ \ n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ @@ -171,12 +179,11 @@ void PASTEMAC(ch,varname) \ or round-robin partitioning was requested at configure-time. 
*/ \ if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ - PASTEMAC(ch,packm_cxk) \ + f \ ( \ conjc, \ schema, \ panel_dim_i, \ - panel_dim_max, \ panel_len_i, \ panel_len_max_i, \ kappa_cast, \ diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index 7c315192d..ead9925e6 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -127,7 +127,7 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases cntx_t* cntx ) { - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( *eff_id ), cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 4ff45036f..cd8827bd9 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -99,7 +99,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 6de361194..874a12439 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -201,7 +201,7 @@ void bli_gemm_ker_var2 // column-stored as well. char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? 
MR : 1 ); char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index e257cdf28..6202cfffd 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -173,7 +173,7 @@ mddm_t bli_gemm_md_ccr // preference. const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); const bool row_pref - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx ); + = bli_cntx_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx ); // We can only perform this case of mixed-domain gemm, C += A*B where // B is real, if the microkernel prefers column output. If it prefers @@ -236,8 +236,8 @@ mddm_t bli_gemm_md_ccr // Use the default pack schemas in the objects. - // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. @@ -278,7 +278,7 @@ mddm_t bli_gemm_md_crc // preference. const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); const bool col_pref - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx ); + = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx ); // We can only perform this case of mixed-domain gemm, C += A*B where // A is real, if the microkernel prefers row output. If it prefers @@ -341,8 +341,8 @@ mddm_t bli_gemm_md_crc // Use the default pack schemas in the objects. 
- // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. @@ -430,13 +430,11 @@ mddm_t bli_gemm_md_rcc const num_t dt_complex = bli_obj_dt( a ); cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); - func_t* cntx_funcs = bli_cntx_packm_kers_buf( *cntx ); - func_t* cntx_1m_funcs = bli_cntx_packm_kers_buf( cntx_1m ); + func_t* cntx_funcs = bli_cntx_ukrs_buf( *cntx ); + func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m ); - for ( dim_t i = 0; i <= BLIS_PACKM_31XK_KER; ++i ) - { - cntx_funcs[ i ] = cntx_1m_funcs[ i ]; - } + cntx_funcs[ BLIS_PACKM_MRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_MRXK_KER ]; + cntx_funcs[ BLIS_PACKM_NRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_NRXK_KER ]; // Return the computation and execution domains. 
return doms; diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c index bbd9190a9..a4797ad4f 100644 --- a/frame/3/gemm/bli_gemm_md_c2r_ref.c +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c @@ -57,8 +57,8 @@ void PASTEMAC2(ch,opname,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c index 62d2a9e04..c5cf935b8 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2.c @@ -198,7 +198,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c index 289e4ddf5..946e3048c 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2rr.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c @@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 
1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c index d75838fb4..f5159bbb9 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2sl.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c @@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 2a9d91759..d53838470 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -86,7 +86,7 @@ void bli_gemmt_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index fea4efec0..3aedc6e9a 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 4b849bbc6..b3a9fe8a1 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c index 0bf4b1a0f..ece351ef7 100644 --- a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c index 1655bea55..f00e769b5 100644 --- a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 9835de9c1..15460125d 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -117,7 +117,7 @@ void bli_hemm_front // micro-kernel to access elements of C in its preferred manner. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_toggle_conj( &a_local ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index be94c44c1..8108b607f 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -117,7 +117,7 @@ void bli_symm_front // micro-kernel to access elements of C in its preferred manner. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &b_local ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 1de28958e..d973b6eb6 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -135,7 +135,7 @@ void bli_trmm_front // of row- vs. column storage breaks down. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c index 9ab64e470..706e14d43 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c index 6fef4e0c9..699892635 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ @@ -337,7 +337,7 @@ void PASTEMAC(ch,varname) \ dim_t jr_inc; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for - the initial rectangular region of C (if it exists). + the initial rectangular region of C (if it exists). NOTE: Parallelism in the 1st loop is disabled for now. 
*/ \ bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c index e0d9cc75f..eb5577593 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c index 0abcfd77a..738711f58 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c index 8c505f88a..df53b2011 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c index 3bb0deaa3..fbcd4f9aa 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c index 672caaa05..7775d9217 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c index 9d9e3809c..c1354a962 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c index 8bac0ec4a..7cf8eeef0 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c index fc2991b13..1d0f31708 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c index 00a0dc3f0..d8ae4f8bb 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c index 889fa49fa..c05a082d4 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 3b9753960..9cd04963b 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -127,7 +127,7 @@ void bli_trmm3_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index f50f739e7..7b1133c2a 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -180,7 +180,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 4f3514143..2059d1c9f 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -180,7 +180,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ */ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index b4937134f..cace3622a 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -185,7 +185,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 09942d311..4b0c7f083 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -185,7 +185,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c index dc57eac5f..26da1b004 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c index 38768242e..607b40e54 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c index 78ffe1758..3299b5f8e 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c index 7c4cea976..b02ff0955 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c index 8d050c62b..e78cef477 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c index b49a1144e..93cac371a 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c index a11936389..1e903c3c1 100644 --- a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c index 7ad1e4271..a44d64f45 100644 --- a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 3a698871b..218325d5a 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -43,253 +43,76 @@ void bli_cntx_clear( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) +void bli_cntx_set_blkszs( cntx_t* cntx, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default blocksizes. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // blocksizes across all datatypes. + // bli_cntx_init_<subconfig>_ref() so that the context begins with + // default blocksizes across all datatypes. /* Example prototypes: void bli_cntx_set_blkszs ( - ind_t method = BLIS_NAT, - dim_t n_bs, + cntx_t* cntx, bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, - ... 
- cntx_t* cntx - ); - - void bli_cntx_set_blkszs - ( - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2, - ... - cntx_t* cntx + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_bs ); - - // Handle native and induced method cases separately. - if ( method == BLIS_NAT ) - { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - - // Store the values in our temporary arrays. 
- bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - bmults[ i ] = bm_id; - } - } - else // if induced method execution was indicated - { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, - // - the bszid_t of the multiple we need to associate with - // the blksz_t object, - // - the scalars we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes (for default - // and maximum blocksizes). - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - double dsclr = ( double )va_arg( args, double ); - double msclr = ( double )va_arg( args, double ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - bmults[ i ] = bm_id; - dsclrs[ i ] = dsclr; - msclrs[ i ] = msclr; - } - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - // Save the execution type into the context. - bli_cntx_set_method( method, cntx ); + bli_cntx_set_method( BLIS_NAT, cntx ); // Query the context for the addresses of: // - the blocksize object array // - the blocksize multiple array - blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx ); - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. + // Initialize variable argument environment. 
+ va_list args; + va_start( args, cntx ); - // Handle native and induced method cases separately. - if ( method == BLIS_NAT ) + // Process blocksizes until we get a BLIS_VA_END. + while ( true ) { - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - - blksz_t* blksz = blkszs[ i ]; + int bs_id0 = va_arg( args, int ); - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + // If we find a bszid_t id of BLIS_VA_END, then we are done. + if ( bs_id0 == BLIS_VA_END ) break; - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. Do the same - // for the blocksize multiple id. - //cntx_blkszs[ bs_id ] = *blksz; - //bli_blksz_copy( blksz, cntx_blksz ); - bli_blksz_copy_if_pos( blksz, cntx_blksz ); + // Here, we query the variable argument list for: + // - the bszid_t of the blocksize we're about to process (already done), + // - the address of the blksz_t object, + // - the bszid_t of the multiple we need to associate with + // the blksz_t object. + bszid_t bs_id = ( bszid_t )bs_id0; + blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); + bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - // Copy the blocksize multiple id into the context. - cntx_bmults[ bs_id ] = bm_id; - } - } - else - { - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - double dsclr = dsclrs[ i ]; - double msclr = msclrs[ i ]; - - blksz_t* blksz = blkszs[ i ]; - - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; - - // Copy the real domain values of the source blksz_t object into - // the context, duplicating into the complex domain fields. 
- bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - - // If the default blocksize scalar is non-unit, we need to scale - // the complex domain default blocksizes. - if ( dsclr != 1.0 ) - { - // Scale the complex domain default blocksize values in the - // blocksize object. - bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - } - - // Similarly, if the maximum blocksize scalar is non-unit, we need - // to scale the complex domain maximum blocksizes. - if ( msclr != 1.0 ) - { - // Scale the complex domain maximum blocksize values in the - // blocksize object. - bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); - } - - // Copy the blocksize multiple id into the context. - cntx_bmults[ bs_id ] = bm_id; - } + // Copy the blksz_t object contents into the appropriate + // location within the context's blksz_t array. Do the same + // for the blocksize multiple id. + //cntx_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy( blksz, cntx_blksz ); + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + bli_blksz_copy_if_pos( blksz, cntx_blksz ); + + // Copy the blocksize multiple id into the context. + cntx_bmults[ bs_id ] = bm_id; } - // Free the temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( blkszs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bszids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bmults ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( dsclrs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( msclrs ); + // Shutdown variable argument environment and clean up stack. + va_end( args ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ) +void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ) { /* Example prototypes: @@ -297,1269 +120,268 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ) ( ind_t method != BLIS_NAT, num_t dt, - dim_t n_bs, + cntx_t* cntx, bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0, bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1, bszid_t bs2_id, dim_t def_scalr2, dim_t max_scalr2, - ... - cntx_t* cntx + ..., + BLIS_VA_END ); - + NOTE: This function modifies an existing context that is presumed to have been initialized for native execution. */ - va_list args; - dim_t i; - err_t r_val; - // Project the given datatype to the real domain. This will be used later on. num_t dt_real = bli_dt_proj_to_real( dt ); // Return early if called with BLIS_NAT. if ( method == BLIS_NAT ) return; - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - // -- Begin variable argument section -- + // Save the execution type into the context. + bli_cntx_set_method( method, cntx ); // Initialize variable argument environment. - va_start( args, n_bs ); + va_list args; + va_start( args, cntx ); + // Process blocksizes until we get a BLIS_VA_END. + while ( true ) { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the scalars we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes (for default - // and maximum blocksizes). - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - double dsclr = ( double )va_arg( args, double ); - double msclr = ( double )va_arg( args, double ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - dsclrs[ i ] = dsclr; - msclrs[ i ] = msclr; - } - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- + int bs_id0 = va_arg( args, int ); - // Save the execution type into the context. - bli_cntx_set_method( method, cntx ); + // If we find a bszid_t id of BLIS_VA_END, then we are done. 
+ if ( bs_id0 == BLIS_VA_END ) break; - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - { - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) + // Here, we query the variable argument list for: + // - the bszid_t of the blocksize we're about to process (already done), + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = ( bszid_t )bs_id0; + double dsclr = ( double )va_arg( args, double ); + double msclr = ( double )va_arg( args, double ); + + // Query the context for the blksz_t object associated with the + // current blocksize id, and also query the object corresponding + // to the blocksize multiple. + blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); + + // Copy the real domain value of the blksz_t object into the + // corresponding complex domain slot of the same object. + bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz ); + + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Read the current blocksize id, blocksize multiple id, - // and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - double dsclr = dsclrs[ i ]; - double msclr = msclrs[ i ]; - - //blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; - - // Query the context for the blksz_t object assoicated with the - // current blocksize id, and also query the object corresponding - // to the blocksize multiple. - blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); - - // Copy the real domain value of the blksz_t object into the - // corresponding complex domain slot of the same object. - bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz ); - - // If the default blocksize scalar is non-unit, we need to scale - // the complex domain default blocksizes. 
- if ( dsclr != 1.0 ) - { - // Scale the default blocksize value corresponding to the given - // datatype. - bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz ); - } - - // Similarly, if the maximum blocksize scalar is non-unit, we need - // to scale the complex domain maximum blocksizes. - if ( msclr != 1.0 ) - { - // Scale the maximum blocksize value corresponding to the given - // datatype. - bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz ); - } + // Scale the default blocksize value corresponding to the given + // datatype. + bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz ); } - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( bszids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( dsclrs ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( msclrs ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 microkernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // microkernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l3_nat_ukrs - ( - dim_t n_ukrs, - l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, bool pref0, - l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, bool pref1, - l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, bool pref2, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ukrs ); - - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) - { - // Here, we query the variable argument list for: - // - the l3ukr_t of the kernel we're about to process, - // - the datatype of the kernel, - // - the kernel function pointer, and - // - the kernel function storage preference - // that we need to store to the context. - - // NOTE: Though bool_t is no longer used, the following comment is - // being kept for historical reasons. - // The type that we pass into the va_arg() macro for the ukr - // preference matters. Using 'bool_t' may cause breakage on 64-bit - // systems that define int as 32 bits and long int and pointers as - // 64 bits. The problem is that TRUE or FALSE are defined as 1 and - // 0, respectively, and when "passed" into the variadic function - // they come with no contextual typecast. Thus, default rules of - // argument promotion kick in to treat these integer literals as - // being of type int. Thus, we need to let va_arg() treat the TRUE - // or FALSE value as an int, even if we cast it to and store it - // within a bool_t afterwards. 
- const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); - const bool ukr_pref = ( bool )va_arg( args, int ); - - // Store the values in our temporary arrays. - ukr_ids[ i ] = ukr_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - ukr_prefs[ i ] = ukr_pref; + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the maximum blocksize value corresponding to the given + // datatype. + bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz ); + } } - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - // Shutdown variable argument environment and clean up stack. va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 virtual ukernel func_t array - // - the l3 native ukernel func_t array - // - the l3 native ukernel preferences array - func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); - func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); - mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current ukernel id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const l3ukr_t ukr_id = ukr_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void_fp ukr_fp = ukr_fps[ i ]; - const bool ukr_pref = ukr_prefs[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. 
- func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; - func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ]; - mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ]; - - // Store the ukernel function pointer and preference values into - // the context. Notice that we redundantly store the native - // ukernel address in both the native and virtual ukernel slots - // in the context. This is standard practice when creating a - // native context. (Induced method contexts will overwrite the - // virtual function pointer with the address of the appropriate - // virtual ukernel.) - bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); - bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_fps ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_prefs ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ) +void bli_cntx_set_ukrs( cntx_t* cntx , ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use - // non-default level-3 virtual microkernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // microkernels across all datatypes. + // non-default microkernels. It should be called after + // bli_cntx_init_<subconfig>_ref() so that the context begins with + // default microkernels across all datatypes. 
/* Example prototypes: - void bli_cntx_set_l3_vir_ukrs + void bli_cntx_set_ukrs ( - dim_t n_ukrs, - l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, - l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, - l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, - ... - cntx_t* cntx + cntx_t* cntx, + ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, + ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, + ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the ukernel func_t array + func_t* cntx_ukrs = bli_cntx_ukrs_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_ukrs ); + va_list args; + va_start( args, cntx ); - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) + // Process ukernels until BLIS_VA_END is reached. + while ( true ) { - // Here, we query the variable argument list for: - // - the l3ukr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer. - // that we need to store to the context. - const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. 
- ukr_ids[ i ] = ukr_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - } + const int ukr_id0 = va_arg( args, int ); - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); + // If we find a ukernel id of BLIS_VA_END, then we are done. + if ( ukr_id0 == BLIS_VA_END ) break; - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 virtual ukernel func_t array - func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current ukernel id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const l3ukr_t ukr_id = ukr_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void_fp ukr_fp = ukr_fps[ i ]; + // Here, we query the variable argument list for: + // - the ukr_t of the kernel we're about to process (already done), + // - the datatype of the kernel, and + // - the kernel function pointer + const ukr_t ukr_id = ( ukr_t )ukr_id0; + const num_t ukr_dt = ( num_t )va_arg( args, num_t ); + void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); // Index into the func_t and mbool_t for the current kernel id // being processed. - func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; + func_t* ukrs = &cntx_ukrs[ ukr_id ]; - // Store the ukernel function pointer and preference values into - // the context. Notice that we redundantly store the native + // Store the ukernel function pointer into the context. + // Notice that we redundantly store the native // ukernel address in both the native and virtual ukernel slots // in the context. This is standard practice when creating a // native context. 
(Induced method contexts will overwrite the // virtual function pointer with the address of the appropriate // virtual ukernel.) - bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default thresholds for small/unpacked matrix handling. It should - // be called after bli_cntx_init_defaults() so that the context begins - // with default thresholds. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_thresh - ( - dim_t n_thresh, - threshid_t th0_id, blksz_t* blksz0, - threshid_t th1_id, blksz_t* blksz1, - ... - cntx_t* cntx - ); - - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - threshid_t* threshids = bli_malloc_intl( n_thresh * sizeof( threshid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - blksz_t** threshs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_thresh ); - - // Process n_thresh tuples. 
- for ( i = 0; i < n_thresh; ++i ) - { - // Here, we query the variable argument list for: - // - the threshid_t of the threshold we're about to process, - // - the address of the blksz_t object, - threshid_t th_id = ( threshid_t )va_arg( args, threshid_t ); - blksz_t* thresh = ( blksz_t* )va_arg( args, blksz_t* ); - - // Store the values in our temporary arrays. - threshids[ i ] = th_id; - threshs[ i ] = thresh; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the threshold array - blksz_t* cntx_threshs = bli_cntx_l3_sup_thresh_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_thresh; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. - threshid_t th_id = threshids[ i ]; - blksz_t* thresh = threshs[ i ]; - - blksz_t* cntx_thresh = &cntx_threshs[ th_id ]; - - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. - //cntx_threshs[ th_id ] = *thresh; - //bli_blksz_copy( thresh, cntx_thresh ); - bli_blksz_copy_if_pos( thresh, cntx_thresh ); - } - - // Free the temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - bli_free_intl( threshs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - bli_free_intl( threshids ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 operation handler for small/unpacked matrices. It - // should be called after bli_cntx_init_defaults() so that the context - // begins with default sup handlers across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_handlers - ( - dim_t n_ops, - opid_t op0_id, void* handler0_fp, - opid_t op1_id, void* handler1_fp, - opid_t op2_id, void* handler2_fp, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ops ); - - // Process n_ukrs tuples. - for ( i = 0; i < n_ops; ++i ) - { - // Here, we query the variable argument list for: - // - the opid_t of the operation we're about to process, - // - the sup handler function pointer - // that we need to store to the context. - const opid_t op_id = ( opid_t )va_arg( args, opid_t ); - void* op_fp = ( void* )va_arg( args, void* ); + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - // Store the values in our temporary arrays. 
- op_ids[ i ] = op_id; - op_fps[ i ] = op_fp; + // Locate the virtual ukernel func_t pointer that corresponds to the + // ukernel id provided by the caller. + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; break; + case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; break; + case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break; + case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break; + case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break; + default: ukrs = NULL; break; + }; + + if ( ukrs ) + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); } - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - // Shutdown variable argument environment and clean up stack. va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 small/unpacked handlers array - void** cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each operation id tuple provided. - for ( i = 0; i < n_ops; ++i ) - { - // Read the current operation id and handler function pointer. - const opid_t op_id = op_ids[ i ]; - void* op_fp = op_fps[ i ]; - - // Store the sup handler function pointer into the slot for the - // specified operation id. - cntx_l3_sup_handlers[ op_id ] = op_fp; - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_fps ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... 
) +void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use - // non-default l3 sup blocksizes. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // blocksizes across all datatypes. + // non-default microkernel preferences. It should be called after + // bli_cntx_init_*_ref() so that the context begins with + // default preferences across all datatypes. /* Example prototypes: - void bli_cntx_set_blkszs + void bli_cntx_set_ukr_prefs ( - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, - bszid_t bs1_id, blksz_t* blksz1, - bszid_t bs2_id, blksz_t* blksz2, - ... - cntx_t* cntx + cntx_t* cntx, + ukr_pref_t ukr_pref0_id, num_t dt0, bool ukr_pref0, + ukr_pref_t ukr_pref1_id, num_t dt1, bool ukr_pref1, + ukr_pref_t ukr_pref2_id, num_t dt2, bool ukr_pref2, + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the ukernel preference mbool_t array + mbool_t* cntx_ukr_prefs = bli_cntx_ukr_prefs_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_bs ); - - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object. - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - - // Store the values in our temporary arrays. 
- bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the blocksize object array - blksz_t* cntx_l3_sup_blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - blksz_t* blksz = blkszs[ i ]; - - blksz_t* cntx_l3_sup_blksz = &cntx_l3_sup_blkszs[ bs_id ]; - - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. - //cntx_l3_sup_blkszs[ bs_id ] = *blksz; - //bli_blksz_copy( blksz, cntx_l3_sup_blksz ); - bli_blksz_copy_if_pos( blksz, cntx_l3_sup_blksz ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( blkszs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bszids ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 microkernels for small/unpacked matrices. 
It - // should be called after bli_cntx_init_defaults() so that the context - // begins with default sup micro/millikernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_kers - ( - dim_t n_ukrs, - stor3_t stor_id0, num_t dt0, void* ukr0_fp, bool pref0, - stor3_t stor_id1, num_t dt1, void* ukr1_fp, bool pref1, - stor3_t stor_id2, num_t dt2, void* ukr2_fp, bool pref2, - ... - cntx_t* cntx - ); - */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - stor3_t* st3_ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); + va_start( args, cntx ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ukrs ); - - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) + // Process ukernel preferences until BLIS_VA_END is reached. + while ( true ) { - // Here, we query the variable argument list for: - // - the stor3_t storage case being assigned to the kernel we're - // about to process, - // - the datatype of the kernel, - // - the kernel function pointer, and - // - the kernel function storage preference - // that we need to store to the context. 
- const stor3_t st3_id = ( stor3_t )va_arg( args, stor3_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void* ukr_fp = ( void* )va_arg( args, void* ); - const bool ukr_pref = ( bool )va_arg( args, int ); - - // Store the values in our temporary arrays. - st3_ids[ i ] = st3_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - ukr_prefs[ i ] = ukr_pref; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 small/unpacked ukernel func_t array - // - the l3 small/unpacked ukernel preferences array - func_t* cntx_l3_sup_kers = bli_cntx_l3_sup_kers_buf( cntx ); - mbool_t* cntx_l3_sup_kers_prefs = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - -#if 0 - dim_t sup_map[ BLIS_NUM_LEVEL3_SUP_UKRS ][2]; - - // Create the small/unpacked ukernel mappings: - // - rv -> rrr 0, rcr 2 - // - rg -> rrc 1, rcc 3 - // - cv -> ccr 6, ccc 7 - // - cg -> crr 4, crc 5 - // - rd -> rrc 1 - // - cd -> crc 5 - // - rc -> rcc 3 - // - cr -> crr 4 - // - gx -> xxx 8 - // NOTE: We only need to set one slot in the context l3_sup_kers array - // for the general-stride/generic ukernel type, but since the loop below - // needs to be set up to set two slots to accommodate the RV, RG, CV, and - // CG, ukernel types, we will just be okay with the GX ukernel being set - // redundantly. (The RD, CD, CR, and RC ukernel types are set redundantly - // for the same reason.) 
- sup_map[ BLIS_GEMMSUP_RV_UKR ][0] = BLIS_RRR; - sup_map[ BLIS_GEMMSUP_RV_UKR ][1] = BLIS_RCR; - sup_map[ BLIS_GEMMSUP_RG_UKR ][0] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_RG_UKR ][1] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_CV_UKR ][0] = BLIS_CCR; - sup_map[ BLIS_GEMMSUP_CV_UKR ][1] = BLIS_CCC; - sup_map[ BLIS_GEMMSUP_CG_UKR ][0] = BLIS_CRR; - sup_map[ BLIS_GEMMSUP_CG_UKR ][1] = BLIS_CRC; - - sup_map[ BLIS_GEMMSUP_RD_UKR ][0] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_RD_UKR ][1] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_CD_UKR ][0] = BLIS_CRC; - sup_map[ BLIS_GEMMSUP_CD_UKR ][1] = BLIS_CRC; - - sup_map[ BLIS_GEMMSUP_RC_UKR ][0] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_RC_UKR ][1] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_CR_UKR ][0] = BLIS_CRR; - sup_map[ BLIS_GEMMSUP_CR_UKR ][1] = BLIS_CRR; - - sup_map[ BLIS_GEMMSUP_GX_UKR ][0] = BLIS_XXX; - sup_map[ BLIS_GEMMSUP_GX_UKR ][1] = BLIS_XXX; -#endif - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current stor3_t id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const stor3_t st3_id = st3_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void* ukr_fp = ukr_fps[ i ]; - const bool ukr_pref = ukr_prefs[ i ]; - - // Index to the func_t and mbool_t for the current stor3_t id - // being processed. - func_t* ukrs = &cntx_l3_sup_kers[ st3_id ]; - mbool_t* prefs = &cntx_l3_sup_kers_prefs[ st3_id ]; - - // Store the ukernel function pointer and preference values into - // the stor3_t location in the context. - bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); - } - - // Free the temporary local arrays. 
- #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( st3_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_fps ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_prefs ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-1f kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default l1f - // kernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l1f_kers - ( - dim_t n_ukrs, - l1fkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1fkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1fkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - l1fkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- + const int ukr_pref_id0 = va_arg( args, int ); - // Initialize variable argument environment. - va_start( args, n_kers ); + // If we find a ukernel pref id of BLIS_VA_END, then we are done. 
+ if ( ukr_pref_id0 == BLIS_VA_END ) break; - // Process n_kers tuples. - for ( i = 0; i < n_kers; ++i ) - { // Here, we query the variable argument list for: - // - the l1fkr_t of the kernel we're about to process, + // - the ukr_pref_t of the preference we're about to process (already done), // - the datatype of the kernel, and // - the kernel function pointer - // that we need to store to the context. - const l1fkr_t ker_id = ( l1fkr_t )va_arg( args, l1fkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the address of: - // - the level-1f kernels func_t array - func_t* cntx_l1f_kers = bli_cntx_l1f_kers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. - const l1fkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; + const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0; + const num_t ukr_pref_dt = ( num_t )va_arg( args, num_t ); + const bool ukr_pref = ( bool )va_arg( args, int ); // Index into the func_t and mbool_t for the current kernel id // being processed. - func_t* kers = &cntx_l1f_kers[ ker_id ]; + mbool_t* ukr_prefs = &cntx_ukr_prefs[ ukr_pref_id ]; - // Store the ukernel function pointer and preference values into - // the context. 
- bli_func_set_dt( ker_fp, ker_dt, kers ); + // Store the ukernel preference value into the context. + bli_mbool_set_dt( ukr_pref, ukr_pref_dt, ukr_prefs ); } - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l1v_kers( dim_t n_kers, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-1v kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default l1v - // kernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l1v_kers - ( - dim_t n_ukrs, - l1vkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1vkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1vkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - l1vkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_kers ); - - // Process n_kers tuples. 
- for ( i = 0; i < n_kers; ++i ) - { - // Here, we query the variable argument list for: - // - the l1vkr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer - // that we need to store to the context. - const l1vkr_t ker_id = ( l1vkr_t )va_arg( args, l1vkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - // Shutdown variable argument environment and clean up stack. va_end( args ); - - // -- End variable argument section -- - - // Query the context for the address of: - // - the level-1v kernels func_t array - func_t* cntx_l1v_kers = bli_cntx_l1v_kers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. - const l1vkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. - func_t* kers = &cntx_l1v_kers[ ker_id ]; - - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); - } - - // Free the temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_fps ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_packm_kers( dim_t n_kers, ... ) +void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use - // non-default packing kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default packm - // kernels across all datatypes. + // non-default level-3 operation handler for small/unpacked matrices. It + // should be called after bli_cntx_init_*_ref() so that the + // context begins with default sup handlers across all datatypes. /* Example prototypes: - void bli_cntx_set_packm_kers + void bli_cntx_set_l3_sup_handlers ( - dim_t n_ukrs, - l1mkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1mkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1mkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... cntx_t* cntx + opid_t op0_id, void_fp handler0_fp, + opid_t op1_id, void_fp handler1_fp, + opid_t op2_id, void_fp handler2_fp, + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the l3 sup handlers array. + void_fp* cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_kers ); + va_list args; + va_start( args, cntx ); - // Process n_kers tuples. - for ( i = 0; i < n_kers; ++i ) + // Process sup handlers until BLIS_VA_END is reached. + while ( true ) { - // Here, we query the variable argument list for: - // - the l1mkr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer - // that we need to store to the context. - const l1mkr_t ker_id = ( l1mkr_t )va_arg( args, l1mkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- + const int op_id0 = va_arg( args, int ); - // Query the context for the address of: - // - the packm kernels func_t array - func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx ); + // If we find an operation id of BLIS_VA_END, then we are done. 
+ if ( op_id0 == BLIS_VA_END ) break; - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. - const l1mkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. - func_t* kers = &cntx_packm_kers[ ker_id ]; + // Here, we query the variable argument list for: + // - the opid_t of the operation we're about to process, + // - the sup handler function pointer + const opid_t op_id = ( opid_t )op_id0; + void_fp op_fp = ( void_fp )va_arg( args, void_fp ); - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); + // Store the sup handler function pointer into the slot for the + // specified operation id. + cntx_l3_sup_handlers[ op_id ] = op_fp; } - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - bli_free_intl( ker_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - bli_free_intl( ker_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - bli_free_intl( ker_fps ); + // Shutdown variable argument environment and clean up stack. 
+ va_end( args ); } // ----------------------------------------------------------------------------- @@ -1586,11 +408,11 @@ void bli_cntx_print( cntx_t* cntx ) ); } - for ( i = 0; i < BLIS_NUM_LEVEL3_UKRS; ++i ) + for ( i = 0; i < BLIS_NUM_UKRS; ++i ) { - func_t* ukr = bli_cntx_get_l3_vir_ukrs( i, cntx ); + func_t* ukr = bli_cntx_get_ukrs( i, cntx ); - printf( "l3 vir ukr %2lu: %16p %16p %16p %16p\n", + printf( "ukr %2lu: %16p %16p %16p %16p\n", ( unsigned long )i, bli_func_get_dt( BLIS_FLOAT, ukr ), bli_func_get_dt( BLIS_DOUBLE, ukr ), @@ -1599,42 +421,16 @@ void bli_cntx_print( cntx_t* cntx ) ); } - for ( i = 0; i < BLIS_NUM_3OP_RC_COMBOS; ++i ) - { - func_t* ukr = bli_cntx_get_l3_sup_kers( i, cntx ); - - printf( "l3 sup ukr %2lu: %16p %16p %16p %16p\n", - ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ukr ), - bli_func_get_dt( BLIS_DOUBLE, ukr ), - bli_func_get_dt( BLIS_SCOMPLEX, ukr ), - bli_func_get_dt( BLIS_DCOMPLEX, ukr ) - ); - } - - for ( i = 0; i < BLIS_NUM_LEVEL1F_KERS; ++i ) - { - func_t* ker = bli_cntx_get_l1f_kers( i, cntx ); - - printf( "l1f ker %2lu: %16p %16p %16p %16p\n", - ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ker ), - bli_func_get_dt( BLIS_DOUBLE, ker ), - bli_func_get_dt( BLIS_SCOMPLEX, ker ), - bli_func_get_dt( BLIS_DCOMPLEX, ker ) - ); - } - - for ( i = 0; i < BLIS_NUM_LEVEL1V_KERS; ++i ) + for ( i = 0; i < BLIS_NUM_UKR_PREFS; ++i ) { - func_t* ker = bli_cntx_get_l1v_kers( i, cntx ); + mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx ); - printf( "l1v ker %2lu: %16p %16p %16p %16p\n", + printf( "ukr pref %2lu: %d %d %d %d\n", ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ker ), - bli_func_get_dt( BLIS_DOUBLE, ker ), - bli_func_get_dt( BLIS_SCOMPLEX, ker ), - bli_func_get_dt( BLIS_DCOMPLEX, ker ) + bli_mbool_get_dt( BLIS_FLOAT, ukr_pref ), + bli_mbool_get_dt( BLIS_DOUBLE, ukr_pref ), + bli_mbool_get_dt( BLIS_SCOMPLEX, ukr_pref ), + bli_mbool_get_dt( BLIS_DCOMPLEX, ukr_pref ) ); } diff --git a/frame/base/bli_cntx.h 
b/frame/base/bli_cntx.h index 76350f6bc..412430e9b 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -43,24 +43,13 @@ /* typedef struct cntx_s { - blksz_t* blkszs; - bszid_t* bmults; + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + bszid_t bmults[ BLIS_NUM_BLKSZS ]; - func_t* l3_vir_ukrs; - func_t* l3_nat_ukrs; - mbool_t* l3_nat_ukrs_prefs; + func_t ukrs[ BLIS_NUM_UKRS ]; + mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - blksz_t* l3_sup_thresh; - void** l3_sup_handlers; - blksz_t* l3_sup_blkszs; - func_t* l3_sup_kers; - mbool_t* l3_sup_kers_prefs; - - func_t* l1f_kers; - func_t* l1v_kers; - - func_t* packm_kers; - func_t* unpackm_kers; + void_fp l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; ind_t method; @@ -81,54 +70,18 @@ BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } -BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) -{ - return cntx->l3_vir_ukrs; -} -BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) +BLIS_INLINE func_t* bli_cntx_ukrs_buf( cntx_t* cntx ) { - return cntx->l3_nat_ukrs; + return cntx->ukrs; } -BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) +BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx ) { - return cntx->l3_nat_ukrs_prefs; + return cntx->ukr_prefs; } -BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_thresh; -} -BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) +BLIS_INLINE void_fp* bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } -BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_blkszs; -} -BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers; -} -BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers_prefs; -} -BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) -{ - return cntx->l1f_kers; -} -BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) -{ - 
return cntx->l1v_kers; -} -BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) -{ - return cntx->packm_kers; -} -BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) -{ - return cntx->unpackm_kers; -} BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; @@ -204,399 +157,144 @@ BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) // ----------------------------------------------------------------------------- -BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + func_t* funcs = bli_cntx_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } -BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); + func_t* func = bli_cntx_get_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } -BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - func_t* func = &funcs[ ukr_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukr_id = BLIS_GEMM_VIR_UKR; break; + case BLIS_TRSM_L_UKR: ukr_id = BLIS_TRSM_L_VIR_UKR; break; + case BLIS_TRSM_U_UKR: ukr_id = BLIS_TRSM_U_VIR_UKR; break; + case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break; + case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break; + default: break; + }; - return bli_func_get_dt( dt, func ); + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } // 
----------------------------------------------------------------------------- -BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t ukr_id, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } -BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, cntx_t* cntx ) { - mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); + mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- -BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); - blksz_t* thresh = &threshs[ thresh_id ]; - - // Return the address of the blksz_t identified by thresh_id. - return thresh; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); - dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); - - // Return the main (default) threshold value for the datatype given. 
- return thresh_dt; -} - BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { - if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; - if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; - if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; + if ( m < bli_cntx_get_blksz_def_dt( dt, BLIS_MT, cntx ) ) return TRUE; + if ( n < bli_cntx_get_blksz_def_dt( dt, BLIS_NT, cntx ) ) return TRUE; + if ( k < bli_cntx_get_blksz_def_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- -BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { - void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); - void* func = funcs[ op ]; + void_fp* funcs = bli_cntx_l3_sup_handlers_buf( cntx ); + void_fp func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- -BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - // Return the address of the blksz_t identified by bs_id. - return blksz; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_def( dt, blksz ); + // This initial value will get overwritten during the switch statement below. + ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; - // Return the main (default) blocksize value for the datatype given. 
- return bs_dt; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_max( dt, blksz ); - - // Return the auxiliary (maximum) blocksize value for the datatype given. - return bs_dt; -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); - func_t* func = &funcs[ stor_id ]; - - return func; -} - -BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - mbool_t* mbool = &mbools[ stor_id ]; - - return mbool; -} - -BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); - - return ( bool )bli_mbool_get_dt( dt, mbool ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); - func_t* func = &funcs[ ker_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); - func_t* func = 
&funcs[ ker_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested packm func_t if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) - { - func_t* funcs = bli_cntx_packm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; - } - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the packm func_t (and then extract the - // datatype-specific function pointer) if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) + // Get the correct preference from the kernel ID. 
+ switch ( ukr_id ) { - func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break; + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break; + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCR_UKR: ukr_pref_id = BLIS_GEMMSUP_RCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCC_UKR: ukr_pref_id = BLIS_GEMMSUP_RCC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CRR_UKR: ukr_pref_id = BLIS_GEMMSUP_CRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CRC_UKR: ukr_pref_id = BLIS_GEMMSUP_CRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCR_UKR: ukr_pref_id = BLIS_GEMMSUP_CCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCC_UKR: ukr_pref_id = BLIS_GEMMSUP_CCC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_XXX_UKR: ukr_pref_id = BLIS_GEMMSUP_XXX_UKR_ROW_PREF; break; + default: break; // TODO: should be an error condition } - return fp; -} - -BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested unpackm func_t if the unpackm kernel being - // requested is one that is explicitly supported. 
- if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) - { - func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; - } - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the unpackm func_t (and then extract the - // datatype-specific function pointer) if the unpackm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) + // For virtual ukernels during non-native execution, use the real projection of + // the datatype. + if ( bli_cntx_method( cntx ) != BLIS_NAT ) { - func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); + switch ( ukr_id ) + { + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break; + default: break; + } } - return fp; -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool ) - ( prefs == TRUE ); -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. 
- return ( bool ) - ( prefs == FALSE ); -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). - const num_t dt = bli_obj_comp_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - return ( bool ) - !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. - // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); - - return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); -} - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. - // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. 
- if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); - - return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); -} - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). - const num_t dt = bli_obj_comp_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; + return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx ); } -BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - return ( bool ) - !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); + return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool ) - ( prefs == TRUE ); -} - -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. - return ( bool ) - ( prefs == FALSE ); -} - -#if 0 -// NOTE: These static functions aren't needed yet. 
- -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) { - const num_t dt = bli_obj_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); - bool r_val = FALSE; + const bool ukr_prefers_rows + = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; + if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) return TRUE; + else if ( bli_obj_is_col_stored( obj ) && !ukr_prefers_rows ) return TRUE; - return r_val; + return FALSE; } -BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) { - return ( bool ) - !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); + return ! 
bli_cntx_prefers_storage_of( obj, ukr_id, cntx ); } -#endif // ----------------------------------------------------------------------------- @@ -632,67 +330,64 @@ BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, c bli_blksz_set_max( bs, dt, blksz ); } -BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + func_t* funcs = bli_cntx_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } -BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); + func_t* func = bli_cntx_get_ukrs( ker_id, cntx ); - funcs[ ukr_id ] = *func; + bli_func_set_dt( fp, dt, func ); } -BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } -BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); + ukr_t ukr_id = bli_stor3_ukr( stor_id ); - funcs[ ker_id ] = *func; + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } -BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); - - funcs[ ker_id ] = *func; -} - -BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); - - funcs[ ker_id ] = *func; -} - -BLIS_INLINE void 
bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); -} - -BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_get_unpackm_kers( ker_id, cntx ); + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - funcs[ ker_id ] = *func; + return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ); } -BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - bli_func_set_dt( fp, dt, func ); + return bli_cntx_get_blksz_max_dt( dt, bs_id, cntx ); } // ----------------------------------------------------------------------------- @@ -701,24 +396,17 @@ BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); -BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... 
); +BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... ); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); +BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ); + #endif diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index cc17b33ff..1372a055a 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -185,7 +185,7 @@ void bli_gks_init( void ) bli_gks_register_cntx( BLIS_ARCH_POWER10, bli_cntx_init_power10, bli_cntx_init_power10_ref, bli_cntx_init_power10_ind ); -#endif +#endif #ifdef BLIS_CONFIG_POWER9 bli_gks_register_cntx( BLIS_ARCH_POWER9, bli_cntx_init_power9, bli_cntx_init_power9_ref, @@ -267,7 +267,7 @@ void bli_gks_finalize( void ) void bli_gks_init_index( void ) { // This function is called by bli_gks_init(). It simply initializes all - // architecture id elements of the internal arrays to NULL. + // architecture id elements of the internal arrays to NULL. const size_t gks_size = sizeof( cntx_t* ) * BLIS_NUM_ARCHS; const size_t fpa_size = sizeof( void_fp ) * BLIS_NUM_ARCHS; @@ -382,7 +382,7 @@ void bli_gks_register_cntx // functions for reference kernels and induced method execution. 
The // former will be used whenever we need to obtain reference kernels and // latter will be used later on if the user calls a level-3 function - // with induced execution enabled. + // with induced execution enabled. cntx_ref_init[ id ] = ref_fp; cntx_ind_init[ id ] = ind_fp; @@ -582,7 +582,7 @@ cntx_t* bli_gks_query_ind_cntx // function on the newly allocated structure, we must first copy // over the contents of the native context. *gks_id_ind = *gks_id_nat; - + // Use the architecture id to look up the function pointer to the // context initialization function for induced methods. ind_cntx_init_ft f = cntx_ind_init[ id ]; @@ -635,7 +635,7 @@ void bli_gks_init_ref_cntx bool bli_gks_cntx_l3_nat_ukr_is_ref ( num_t dt, - l3ukr_t ukr_id, + ukr_t ukr_id, cntx_t* cntx ) { @@ -647,8 +647,8 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref // Query each context for the micro-kernel function pointer for the // specified datatype. - void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, &ref_cntx ); - void_fp fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, cntx ); + void_fp ref_fp = bli_cntx_get_ukr_dt( dt, ukr_id, &ref_cntx ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); // Return the result. return fp == ref_fp; @@ -668,7 +668,7 @@ static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = // ----------------------------------------------------------------------------- -char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) +char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) { kimpl_t ki; @@ -676,7 +676,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) // then query the ukernel function pointer for the given datatype from // that context. cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); - void_fp fp = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, cntx ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given // datatype. 
If it is NULL, return the string for not applicable. @@ -691,7 +691,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) } #if 0 -char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ) +char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ) { opid_t oper; ind_t method; @@ -716,7 +716,7 @@ char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ) } #endif -kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) +kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ) { // If the current available induced method is not native, it // must be virtual. @@ -731,8 +731,6 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) // method to the typed function pointer within the known // reference ukrs object. - cntx_t ref_cntx_l; - // Query the architecture id. arch_t id = bli_arch_query_id(); @@ -743,23 +741,13 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) bli_check_error_code( e_val ); } - // Obtain the function pointer to the context initialization function - // for reference kernels. - ref_cntx_init_ft f = cntx_ref_init[ id ]; - - // Initialize a local context with reference kernels and related values. - f( &ref_cntx_l ); - // Query the native context from the gks. cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); - // Query the native ukernel func_t from both the native and reference - // contexts. 
- void_fp nat_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, nat_cntx ); - void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, &ref_cntx_l ); - - if ( nat_fp == ref_fp ) return BLIS_REFERENCE_UKERNEL; - else return BLIS_OPTIMIZED_UKERNEL; + if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) ) + return BLIS_REFERENCE_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; } } diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index 188dcd507..b8e4c4fe0 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -54,12 +54,12 @@ BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); -bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); +bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx ); -BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); -BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ); -//char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); +//char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ); #endif diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 011ebcdfb..e863f7dcf 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -289,6 +289,13 @@ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) +// -- (four auxiliary arguments) -- + +#define INSERT_GENTFUNCCO_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ +\ +GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, 
varname3, varname4 ) \ +GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) + // -- Basic one-operand macro with integer instance -- diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 4de624f98..d273c353a 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -245,7 +245,111 @@ #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif +// -- MR and NR blocksizes (only for reference kernels) ------------------------ +// The build system defines BLIS_IN_REF_KERNEL, but only when compiling +// reference kernels. By using compile-time constants for MR and NR, the +// compiler can perform certain optimizations, such as unrolling and +// vectorization, that would not be otherwise be possible. +#ifdef BLIS_IN_REF_KERNEL + +#ifndef BLIS_MR_s +#define BLIS_MR_s 4 +#endif + +#ifndef BLIS_MR_d +#define BLIS_MR_d 4 +#endif + +#ifndef BLIS_MR_c +#define BLIS_MR_c 4 +#endif + +#ifndef BLIS_MR_z +#define BLIS_MR_z 4 +#endif + +#ifndef BLIS_NR_s +#define BLIS_NR_s 16 +#endif + +#ifndef BLIS_NR_d +#define BLIS_NR_d 8 +#endif + +#ifndef BLIS_NR_c +#define BLIS_NR_c 8 +#endif + +#ifndef BLIS_NR_z +#define BLIS_NR_z 4 +#endif + +#ifndef BLIS_BBM_s +#define BLIS_BBM_s 1 +#endif + +#ifndef BLIS_BBM_d +#define BLIS_BBM_d 1 +#endif + +#ifndef BLIS_BBM_c +#define BLIS_BBM_c 1 +#endif + +#ifndef BLIS_BBM_z +#define BLIS_BBM_z 1 +#endif + +#ifndef BLIS_BBN_s +#define BLIS_BBN_s 1 +#endif + +#ifndef BLIS_BBN_d +#define BLIS_BBN_d 1 +#endif + +#ifndef BLIS_BBN_c +#define BLIS_BBN_c 1 +#endif + +#ifndef BLIS_BBN_z +#define BLIS_BBN_z 1 +#endif + +#ifndef BLIS_PACKMR_s +#define BLIS_PACKMR_s (BLIS_MR_s*BLIS_BBM_s) +#endif + +#ifndef BLIS_PACKMR_d +#define BLIS_PACKMR_d (BLIS_MR_d*BLIS_BBM_d) +#endif + +#ifndef BLIS_PACKMR_c +#define BLIS_PACKMR_c (BLIS_MR_c*BLIS_BBM_c) +#endif + +#ifndef BLIS_PACKMR_z +#define BLIS_PACKMR_z (BLIS_MR_z*BLIS_BBM_z) +#endif + +#ifndef BLIS_PACKNR_s 
+#define BLIS_PACKNR_s (BLIS_NR_s*BLIS_BBN_s) +#endif + +#ifndef BLIS_PACKNR_d +#define BLIS_PACKNR_d (BLIS_NR_d*BLIS_BBN_d) +#endif + +#ifndef BLIS_PACKNR_c +#define BLIS_PACKNR_c (BLIS_NR_c*BLIS_BBN_c) +#endif + +#ifndef BLIS_PACKNR_z +#define BLIS_PACKNR_z (BLIS_NR_z*BLIS_BBN_z) +#endif + +#endif #endif diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h index 120338beb..903b4ece6 100644 --- a/frame/include/bli_misc_macro_defs.h +++ b/frame/include/bli_misc_macro_defs.h @@ -164,5 +164,11 @@ BLIS_INLINE void bli_toggle_bool( bool* b ) #define bli_iformatspec() "%6d" +// Sentinel constant used to indicate the end of a variable argument function +// (See bli_cntx.c) + +#define BLIS_VA_END (-1) + + #endif diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 286e79e2b..1822065da 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -754,7 +754,7 @@ BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, *offm_inc = 0; // If the diagonal intersects the right side of the matrix, - // ignore the area below that intersection. + // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; @@ -777,6 +777,14 @@ BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m bli_toggle_uplo( uplo ); } +// we don't know the type of a, so this must be a macro +// rs_a and cs_a must be variables and not expressions +#define bli_reflect_to_stored_part( diagoff, a, rs_a, cs_a ) \ +do { \ + a += ( diagoff ) * ( cs_a - rs_a ); \ + bli_swap_incs( &rs_a, &cs_a ); \ +} while (0) \ + BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; @@ -858,6 +866,22 @@ BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) #endif } +BLIS_INLINE ukr_t bli_stor3_ukr( stor3_t id ) +{ + switch ( id ) + { + case BLIS_RRR: return BLIS_GEMMSUP_RRR_UKR; + case BLIS_RRC: return BLIS_GEMMSUP_RRC_UKR; + case BLIS_RCR: return BLIS_GEMMSUP_RCR_UKR; + case BLIS_RCC: return BLIS_GEMMSUP_RCC_UKR; + case BLIS_CRR: return BLIS_GEMMSUP_CRR_UKR; + case BLIS_CRC: return BLIS_GEMMSUP_CRC_UKR; + case BLIS_CCR: return BLIS_GEMMSUP_CCR_UKR; + case BLIS_CCC: return BLIS_GEMMSUP_CCC_UKR; + default: return BLIS_GEMMSUP_XXX_UKR; + } +} + BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 293c80f91..f567e7ef3 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -49,8 +49,8 @@ // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. 
-#include "bli_setrs.h" // sets real component only -#include "bli_setis.h" // sets imaginary component only +#include "bli_setrs.h" // sets real component only +#include "bli_setis.h" // sets imaginary component only // NOTE: This macro also needs to be defined early on since it determines // how real and imaginary components are accessed (ie: whether the fields @@ -194,6 +194,7 @@ #include "bli_adds_mxn.h" #include "bli_adds_mxn_uplo.h" #include "bli_set0s_mxn.h" +#include "bli_set0s_edge.h" #include "bli_copys_mxn.h" #include "bli_scal2s_mxn.h" #include "bli_xpbys_mxn.h" @@ -230,7 +231,7 @@ #include "bli_scal21rs.h" #include "bli_scal2j1rs.h" -// 1m (1e or 1r) +// 1m (1e or 1r) #include "bli_invert1ms_mxn_diag.h" #include "bli_scal1ms_mxn.h" diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c66505bde..4e64f3711 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -626,7 +626,8 @@ typedef enum typedef enum { - BLIS_ADDV_KER = 0, + // l1v kernels + BLIS_ADDV_KER, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, @@ -639,108 +640,82 @@ typedef enum BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, - BLIS_XPBYV_KER -} l1vkr_t; - -#define BLIS_NUM_LEVEL1V_KERS 14 - - -typedef enum -{ - BLIS_AXPY2V_KER = 0, + BLIS_XPBYV_KER, + BLIS_AXPY2V_KER, BLIS_DOTAXPYV_KER, + + // l1f kernels BLIS_AXPYF_KER, BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER -} l1fkr_t; - -#define BLIS_NUM_LEVEL1F_KERS 5 - - -typedef enum -{ - BLIS_PACKM_0XK_KER = 0, - BLIS_PACKM_1XK_KER = 1, - BLIS_PACKM_2XK_KER = 2, - BLIS_PACKM_3XK_KER = 3, - BLIS_PACKM_4XK_KER = 4, - BLIS_PACKM_5XK_KER = 5, - BLIS_PACKM_6XK_KER = 6, - BLIS_PACKM_7XK_KER = 7, - BLIS_PACKM_8XK_KER = 8, - BLIS_PACKM_9XK_KER = 9, - BLIS_PACKM_10XK_KER = 10, - BLIS_PACKM_11XK_KER = 11, - BLIS_PACKM_12XK_KER = 12, - BLIS_PACKM_13XK_KER = 13, - BLIS_PACKM_14XK_KER = 14, - BLIS_PACKM_15XK_KER = 15, - BLIS_PACKM_16XK_KER = 16, - BLIS_PACKM_17XK_KER = 17, - BLIS_PACKM_18XK_KER = 18, - 
BLIS_PACKM_19XK_KER = 19, - BLIS_PACKM_20XK_KER = 20, - BLIS_PACKM_21XK_KER = 21, - BLIS_PACKM_22XK_KER = 22, - BLIS_PACKM_23XK_KER = 23, - BLIS_PACKM_24XK_KER = 24, - BLIS_PACKM_25XK_KER = 25, - BLIS_PACKM_26XK_KER = 26, - BLIS_PACKM_27XK_KER = 27, - BLIS_PACKM_28XK_KER = 28, - BLIS_PACKM_29XK_KER = 29, - BLIS_PACKM_30XK_KER = 30, - BLIS_PACKM_31XK_KER = 31, - - BLIS_UNPACKM_0XK_KER = 0, - BLIS_UNPACKM_1XK_KER = 1, - BLIS_UNPACKM_2XK_KER = 2, - BLIS_UNPACKM_3XK_KER = 3, - BLIS_UNPACKM_4XK_KER = 4, - BLIS_UNPACKM_5XK_KER = 5, - BLIS_UNPACKM_6XK_KER = 6, - BLIS_UNPACKM_7XK_KER = 7, - BLIS_UNPACKM_8XK_KER = 8, - BLIS_UNPACKM_9XK_KER = 9, - BLIS_UNPACKM_10XK_KER = 10, - BLIS_UNPACKM_11XK_KER = 11, - BLIS_UNPACKM_12XK_KER = 12, - BLIS_UNPACKM_13XK_KER = 13, - BLIS_UNPACKM_14XK_KER = 14, - BLIS_UNPACKM_15XK_KER = 15, - BLIS_UNPACKM_16XK_KER = 16, - BLIS_UNPACKM_17XK_KER = 17, - BLIS_UNPACKM_18XK_KER = 18, - BLIS_UNPACKM_19XK_KER = 19, - BLIS_UNPACKM_20XK_KER = 20, - BLIS_UNPACKM_21XK_KER = 21, - BLIS_UNPACKM_22XK_KER = 22, - BLIS_UNPACKM_23XK_KER = 23, - BLIS_UNPACKM_24XK_KER = 24, - BLIS_UNPACKM_25XK_KER = 25, - BLIS_UNPACKM_26XK_KER = 26, - BLIS_UNPACKM_27XK_KER = 27, - BLIS_UNPACKM_28XK_KER = 28, - BLIS_UNPACKM_29XK_KER = 29, - BLIS_UNPACKM_30XK_KER = 30, - BLIS_UNPACKM_31XK_KER = 31 - -} l1mkr_t; - -#define BLIS_NUM_PACKM_KERS 32 -#define BLIS_NUM_UNPACKM_KERS 32 - - -typedef enum -{ - BLIS_GEMM_UKR = 0, + BLIS_DOTXAXPYF_KER, + + // pack kernels + BLIS_PACKM_MRXK_KER, + BLIS_PACKM_NRXK_KER, + BLIS_PACKM_MRXK_1ER_KER, + BLIS_PACKM_NRXK_1ER_KER, + BLIS_PACKM_MRXMR_DIAG_KER, + BLIS_PACKM_NRXNR_DIAG_KER, + BLIS_PACKM_MRXMR_DIAG_1ER_KER, + BLIS_PACKM_NRXNR_DIAG_1ER_KER, + + // unpack kernels + BLIS_UNPACKM_MRXK_KER, + BLIS_UNPACKM_NRXK_KER, + + // l3 native kernels + BLIS_GEMM_UKR, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR -} l3ukr_t; + BLIS_TRSM_U_UKR, + + // l3 virtual kernels + BLIS_GEMM_VIR_UKR, + BLIS_GEMMTRSM_L_VIR_UKR, + 
BLIS_GEMMTRSM_U_VIR_UKR, + BLIS_TRSM_L_VIR_UKR, + BLIS_TRSM_U_VIR_UKR, + + // gemmsup kernels + BLIS_GEMMSUP_RRR_UKR, + BLIS_GEMMSUP_RRC_UKR, + BLIS_GEMMSUP_RCR_UKR, + BLIS_GEMMSUP_RCC_UKR, + BLIS_GEMMSUP_CRR_UKR, + BLIS_GEMMSUP_CRC_UKR, + BLIS_GEMMSUP_CCR_UKR, + BLIS_GEMMSUP_CCC_UKR, + BLIS_GEMMSUP_XXX_UKR, + + // BLIS_NUM_UKRS must be last! + BLIS_NUM_UKRS +} ukr_t; -#define BLIS_NUM_LEVEL3_UKRS 5 + +typedef enum +{ + // l3 kernel row preferences + BLIS_GEMM_UKR_ROW_PREF, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, + BLIS_TRSM_L_UKR_ROW_PREF, + BLIS_TRSM_U_UKR_ROW_PREF, + + // gemmsup kernel row preferences + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, + BLIS_GEMMSUP_XXX_UKR_ROW_PREF, + + // BLIS_NUM_UKR_PREFS must be last! + BLIS_NUM_UKR_PREFS +} ukr_pref_t; typedef enum @@ -884,39 +859,45 @@ typedef enum // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. - - BLIS_KR = 0, + BLIS_KR, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, + // broadcast factors for packing + BLIS_BBM, + BLIS_BBN, + + // level-2 blocksizes BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension + // level-1f blocksizes BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
+ // gemmsup thresholds + BLIS_MT, // level-3 small/unpacked matrix threshold in m dimension + BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension + BLIS_KT, // level-3 small/unpacked matrix threshold in k dimension + + // gemmsup block sizes + BLIS_KR_SUP, + BLIS_MR_SUP, + BLIS_NR_SUP, + BLIS_MC_SUP, + BLIS_KC_SUP, + BLIS_NC_SUP, + + // BLIS_NO_PART (= BLIS_NUM_BLKSZS) must be last! + BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable, + // such as when characterizing a packm operation. + BLIS_NUM_BLKSZS = BLIS_NO_PART } bszid_t; -#define BLIS_NUM_BLKSZS 11 - - -// -- Threshold ID type -- - -typedef enum -{ - BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension - BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension - BLIS_KT // level-3 small/unpacked matrix threshold in k dimension - -} threshid_t; - -#define BLIS_NUM_THRESH 3 - // -- Architecture ID type -- @@ -1430,21 +1411,10 @@ typedef struct cntx_s blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; - func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; - - blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; - void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; - blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; - func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; - mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; - - func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; - func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; + func_t ukrs[ BLIS_NUM_UKRS ]; + mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; - func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; + void_fp l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; ind_t method; @@ -1577,6 +1547,7 @@ typedef enum // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), + BLIS_INVALID_UKR_ID = (-152), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), 
diff --git a/frame/include/level0/bli_set0s_edge.h b/frame/include/level0/bli_set0s_edge.h new file mode 100644 index 000000000..2c436812e --- /dev/null +++ b/frame/include/level0/bli_set0s_edge.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET0S_EDGE_H +#define BLIS_SET0S_EDGE_H + +// set0s_edge + +// Notes: +// - The single type char (ch) encodes the type of p.
+// - Calls set0s_mxn on an (m-i) x j block at p+i when i < m, and on an m x (n-j) block at p+j*ldp when j < n. + +#define GENTFUNC(ctype,ch,op) \ +\ +BLIS_INLINE void PASTEMAC(ch,op) \ + ( \ + const dim_t i, \ + const dim_t m, \ + const dim_t j, \ + const dim_t n, \ + ctype* restrict p, \ + const inc_t ldp \ + ) \ +{ \ + if ( i < m ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m - i, \ + j, \ + p + i*1, 1, ldp \ + ); \ + } \ +\ + if ( j < n ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m, \ + n - j, \ + p + j*ldp, 1, ldp \ + ); \ + } \ +} + +INSERT_GENTFUNC_BASIC0(set0s_edge) + +#endif diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c index 53904b645..2dd7c7324 100644 --- a/kernels/penryn/1/bli_axpyv_penryn_int.c +++ b/kernels/penryn/1/bli_axpyv_penryn_int.c @@ -102,7 +102,7 @@ void bli_daxpyv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); f ( diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c index 4d39b3641..2e88a577a 100644 --- a/kernels/penryn/1/bli_dotv_penryn_int.c +++ b/kernels/penryn/1/bli_dotv_penryn_int.c @@ -104,7 +104,7 @@ void bli_ddotv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - ddotv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx ); + ddotv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c index 5e8a2a9a1..c809ebb41 100644 --- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c +++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c @@ -110,7 +110,7 @@ void bli_daxpy2v_penryn_int // Call the reference implementation if needed.
if ( use_ref == TRUE ) { - daxpy2v_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx ); + daxpy2v_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c index 66bb88ec6..ce4c4f786 100644 --- a/kernels/penryn/1f/bli_axpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c @@ -115,7 +115,7 @@ void bli_daxpyf_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - daxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); + daxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c index 7602a7f28..6b9dab773 100644 --- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c +++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c @@ -112,7 +112,7 @@ void bli_ddotaxpyv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - ddotaxpyv_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx ); + ddotaxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c index 2deb4a457..fe102d427 100644 --- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c @@ -104,7 +104,7 @@ void bli_ddotxaxpyf_penryn_int // If the vector lengths are zero, scale y by beta and return. 
if ( bli_zero_dim1( m ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( @@ -149,7 +149,7 @@ void bli_ddotxaxpyf_penryn_int if ( use_ref == TRUE ) { - ddotxaxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx ); + ddotxaxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx ); f ( conjat, diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c index ad9dc5fbd..ac9887d59 100644 --- a/kernels/penryn/1f/bli_dotxf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c @@ -90,7 +90,7 @@ void bli_ddotxf_penryn_int // If the vector lengths are zero, scale r by beta and return. if ( bli_zero_dim1( m ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( @@ -134,7 +134,7 @@ void bli_ddotxf_penryn_int // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - ddotxf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx ); + ddotxf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx ); f ( conjat, diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c index 9f76e88e1..fb17dd4b3 100644 --- a/kernels/zen/1/bli_scalv_zen_int.c +++ b/kernels/zen/1/bli_scalv_zen_int.c @@ -83,7 +83,7 @@ void bli_sscalv_zen_int if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; - ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( @@ -182,7 +182,7 @@ void bli_dscalv_zen_int if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; - dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); + dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index c8488890f..9f31b7200 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -84,7 +84,8 @@ void bli_sscalv_zen_int10 if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -93,7 +94,7 @@ void bli_sscalv_zen_int10 x, incx, cntx ); - + return; } @@ -275,9 +276,9 @@ void bli_dscalv_zen_int10 { double* zero = bli_d0; - if( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); + dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( @@ -287,7 +288,7 @@ void bli_dscalv_zen_int10 x, incx, cntx ); - + return; } @@ -458,7 +459,7 @@ void bli_cscalv_zen_int10 { const num_t dt = BLIS_SCOMPLEX; - cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( 
dt, BLIS_SCALV_KER, cntx ); + cscalv_ker_ft f = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); f ( @@ -469,4 +470,3 @@ void bli_cscalv_zen_int10 cntx ); } - diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c index 5ddb56ac5..0ec5f44f5 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_4.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c @@ -36,7 +36,7 @@ #include "blis.h" - void bli_caxpyf_zen_int_4 +void bli_caxpyf_zen_int_4 ( conj_t conja, conj_t conjx, @@ -81,7 +81,7 @@ { if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); + caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c index 15a64d596..1566f9809 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -108,8 +108,9 @@ void bli_saxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if(cntx == NULL) cntx = bli_gks_query_cntx(); - saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -131,7 +132,7 @@ void bli_saxpyf_zen_int_5 cntx ); } - + return; } @@ -359,7 +360,9 @@ void bli_daxpyf_zen_int_5 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -381,7 +384,7 @@ void bli_daxpyf_zen_int_5 cntx ); } - + return; } @@ -559,7 +562,7 @@ void bli_daxpyf_zen_int_5 // ----------------------------------------------------------------------------- -static void bli_daxpyf_zen_int_16x2 +void bli_daxpyf_zen_int_16x2 ( conj_t conja, conj_t conjx, @@ -608,7 +611,7 @@ static void bli_daxpyf_zen_int_16x2 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -843,6 +846,7 @@ static void bli_daxpyf_zen_int_16x2 } // ----------------------------------------------------------------------------- + void bli_daxpyf_zen_int_16x4 ( conj_t conja, @@ -895,8 +899,9 @@ void bli_daxpyf_zen_int_16x4 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if(cntx == NULL) cntx = bli_gks_query_cntx(); - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c index b958600ce..15fdf4651 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_8.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c @@ -104,7 +104,7 @@ void bli_saxpyf_zen_int_8 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -313,7 +313,7 @@ void bli_daxpyf_zen_int_8 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c index e40c785d8..1f4a671b6 100644 --- a/kernels/zen/1f/bli_dotxf_zen_int_8.c +++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c @@ -78,8 +78,8 @@ void bli_sdotxf_zen_int_8 // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) ) { - sscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx ); - + sscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -95,7 +95,7 @@ void bli_sdotxf_zen_int_8 // operation as a loop over dotxv. if ( b_n != fuse_fac ) { - sdotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx ); + sdotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { @@ -468,8 +468,8 @@ void bli_ddotxf_zen_int_8 // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); - + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -485,7 +485,7 @@ void bli_ddotxf_zen_int_8 // operation as a loop over dotxv. 
if ( b_n != fuse_fac ) { - ddotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx ); + ddotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c index 2e648bbd6..2da4bc928 100644 --- a/ref_kernels/1/bli_axpbyv_ref.c +++ b/ref_kernels/1/bli_axpbyv_ref.c @@ -60,7 +60,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ @@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); \ + PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); \ \ scalv_p \ ( \ @@ -105,7 +105,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ @@ -123,7 +123,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ @@ -141,7 +141,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_XPBYV_KER, cntx ); \ + PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_ukr_dt( dt, BLIS_XPBYV_KER, cntx ); \ \ xpbyv_p \ ( \ @@ -163,7 +163,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx ); \ + PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_ukr_dt( dt, BLIS_SCAL2V_KER, cntx ); \ \ scal2v_p \ ( \ @@ -182,7 +182,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ axpyv_p \ ( \ diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c index 31fece0a0..30076ddaf 100644 --- a/ref_kernels/1/bli_axpyv_ref.c +++ b/ref_kernels/1/bli_axpyv_ref.c @@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ @@ -148,7 +148,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c index 1dcb03839..ba0595990 100644 --- a/ref_kernels/1/bli_scal2v_ref.c +++ b/ref_kernels/1/bli_scal2v_ref.c @@ -57,7 +57,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ @@ -75,7 +75,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c index 4945b637b..3e6be7492 100644 --- a/ref_kernels/1/bli_scalv_ref.c +++ b/ref_kernels/1/bli_scalv_ref.c @@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c index 8101023d4..28286a5f8 100644 --- a/ref_kernels/1/bli_xpbyv_ref.c +++ b/ref_kernels/1/bli_xpbyv_ref.c @@ -54,7 +54,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ @@ -71,7 +71,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c index 9c08c96f1..6439ff8b0 100644 --- a/ref_kernels/1f/bli_axpy2v_ref.c +++ b/ref_kernels/1f/bli_axpy2v_ref.c @@ -110,7 +110,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_av \ ( \ diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c index f001108e2..5799a03a6 100644 --- a/ref_kernels/1f/bli_axpyf_ref.c +++ b/ref_kernels/1f/bli_axpyf_ref.c @@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c index faeef5dea..42936c650 100644 --- a/ref_kernels/1f/bli_dotaxpyv_ref.c +++ b/ref_kernels/1f/bli_dotaxpyv_ref.c @@ -132,10 +132,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotv_ker_ft) kfp_dv \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - 
bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_dv \ ( \ diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c index c61217941..990133621 100644 --- a/ref_kernels/1f/bli_dotxaxpyf_ref.c +++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c @@ -165,10 +165,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxf_ker_ft) kfp_df \ = \ - bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ PASTECH(ch,axpyf_ker_ft) kfp_af \ = \ - bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ \ kfp_df \ ( \ diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c index 33f5d1ba5..86781fd58 100644 --- a/ref_kernels/1f/bli_dotxf_ref.c +++ b/ref_kernels/1f/bli_dotxf_ref.c @@ -113,7 +113,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxv_ker_ft) kfp_dv \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ diff --git a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c index cc5852b37..e07090754 100644 --- a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c +++ b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c @@ -67,8 +67,8 @@ void PASTEMAC(ch,varname) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ - PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ + PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* A is m x n. 
*/ \ /* y = beta * y + alpha * A^T w; */ \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c new file mode 100644 index 000000000..5cee5535b --- /dev/null +++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c @@ -0,0 +1,336 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define PACKM_SET1_1E( chr, mnk ) \ +do { \ + PASTEMAC(chr,set1s)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set1s)( *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ +} while (0) + +#define PACKM_SET1_1R( chr, mnk ) \ +do { \ + PASTEMAC(chr,set1s)( *(pi1_r + mnk*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_i + mnk*dfac + d + mnk*ldp2) ); \ +} while (0) + +#define PACKM_SCAL_1E( ch, mn, k, op ) \ +do { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn *inca2 + 0 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(pi1_ri + (mn*2 + 0)*dfac + d + k*ldp2), \ + *(pi1_ri + (mn*2 + 1)*dfac + d + k*ldp2) ); \ + PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn *inca2 + 0 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(pi1_ir + (mn*2 + 0)*dfac + d + k*ldp2), \ + *(pi1_ir + (mn*2 + 1)*dfac + d + k*ldp2) ); \ +} while (0) + +#define PACKM_SCAL_1R( ch, mn, k, op ) \ +do { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0 + k*lda2), \ + *(alpha1 + mn*inca2 + 1 + k*lda2), \ + *(pi1_r + mn*dfac + d + k*ldp2), \ + *(pi1_i + mn*dfac + d + k*ldp2) ); \ +} while (0) + +#define PACKM_DIAG_1E_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ +\ +do \ +{ \ + /* PACKM_SCAL_1E assumes inca2 and lda2 are the strides to use. 
*/ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PACKM_SCAL_1E( ch, mn, k, op ); \ +} while(0) + +#define PACKM_DIAG_BODY_1E_L( ch, op ) \ + PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + +#define PACKM_DIAG_BODY_1E_U( ch, op ) \ + PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op ) + +#define PACKM_DIAG_1R_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ +\ +do \ +{ \ + /* PACKM_SCAL_1R assumes inca2 and lda2 are the strides to use. */ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PACKM_SCAL_1R( ch, mn, k, op ); \ +} while(0) + +#define PACKM_DIAG_BODY_1R_L( ch, op ) \ + PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + +#define PACKM_DIAG_BODY_1R_U( ch, op ) \ + PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op ) + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t cdim, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + ctype* restrict p, inc_t ldp, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt_r, mnr0, cntx ); \ + const dim_t dfac = bli_cntx_get_blksz_def_dt( dt_r, bb0, cntx ); \ +\ + /* start by zeroing out the whole block: cdim_pack x 2*n_max real-domain elements of p */ \ + PASTEMAC(chr,set0s_mxn) \ + ( \ + cdim_pack, \ + 2*n_max, \ + ( ctype_r* )p, 1, ldp \ + ); \ +\ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ +
ctype_r* restrict alpha1 = ( ctype_r* )a; \ +\ + if ( bli_is_1e_packed( schema ) ) \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ +\ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ +\ + /* write the strictly lower part if it exists (strides are swapped when a is stored upper) */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_L( ch, scal2ris ); \ + } \ +\ + /* write the strictly upper part if it exists (strides are swapped when a is stored lower) */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_U( ch, scal2ris ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( -kappa_i, mu_r, *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)(
kappa_r, mu_r, *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2ris ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,invertris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(ch,copyjris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + } \ + } \ +\ + /* if this is an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else /* bli_is_1r_packed( schema ) */ \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ +\ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + /* write the strictly lower part if it exists (strides are swapped when a is stored upper) */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_L( ch, scal2ris ); \ + } \ +\ + /* write the strictly upper part if it exists (strides are swapped when a is stored lower) */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if (
bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_U( ch, scal2ris ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_r + mnk*(dfac + ldp2) + d) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_i + mnk*(dfac + ldp2) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2ris ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,invertris)( *(pi1_r + mnk*(dfac + ldp2) + d), \ + *(pi1_i + mnk*(dfac + ldp2) + d) ); \ + } \ +\ + /* if this is an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC4( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_nrxnr_diag_1er, BLIS_NR, BLIS_BBN,
BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c new file mode 100644 index 000000000..80ffcbc14 --- /dev/null +++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c @@ -0,0 +1,173 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define PACKM_DIAG_BODY( ctype, ch, mn_min, mn_max, inca, lda, op ) \ +\ +do \ +{ \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca + k*lda), *(pi1 + mn*dfac + d + k*ldp) ); \ +} while(0) + +#define PACKM_DIAG_BODY_L( ctype, ch, op ) \ + PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op ) + +#define PACKM_DIAG_BODY_U( ctype, ch, op ) \ + PACKM_DIAG_BODY( ctype, ch, 0, k, inca_u, lda_u, op ) + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t cdim, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + ctype* restrict p, inc_t ldp, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ + const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt, mnr0, cntx ); \ + const dim_t dfac = bli_cntx_get_blksz_def_dt( dt, bb0, cntx ); \ +\ + /* start by zeroing out the whole block */ \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + cdim_pack, \ + n_max, \ + p, 1, ldp \ + ); \ +\ + ctype kappa_cast = *( ctype* )kappa; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l = inca; \ + dim_t lda_l = lda; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l, &lda_l ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_L( ctype, ch, scal2s ); \ + } \ +\ + /* write the strictly upper part if 
it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u = inca; \ + dim_t lda_u = lda; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u, &lda_u ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_U( ctype, ch, scal2s ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype mu; \ + PASTEMAC(ch,copys)( *(alpha1 + mnk*(inca + lda)), mu ); \ + PASTEMAC(ch,seti0s)( mu ); \ + PASTEMAC(ch,scal2s)( kappa_cast, mu, *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,inverts)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ +\ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ +} + 
+INSERT_GENTFUNC_BASIC4( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_nrxnr_diag, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 03ec46d14..56d8379be 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -34,458 +34,48 @@ #include "blis.h" -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ +#define PACKM_1E_BODY( ctype, ch, pragma, cdim, inca2, op ) \ \ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ +do \ { \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 
0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 
0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ + for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ + pragma \ + for ( dim_t mn = 0; mn < cdim; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ + PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ \ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - 
offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ + alpha1 += lda2; \ + pi1_ri += ldp2; \ + pi1_ir += ldp2; \ } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_2xk_1er, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - +} while(0) -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ +#define PACKM_1R_BODY( ctype, ch, pragma, cdim, inca2, op ) \ \ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ +do \ { \ - if ( cdim == mnr ) \ + for ( dim_t k = n; k != 0; --k ) \ { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += 
lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ + pragma \ + for ( dim_t mn = 0; mn < cdim; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ \ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k 
!= 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( 
*kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ + alpha1 += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_4xk_1er, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - +} while(0) #undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -500,1719 +90,94 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict 
alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 
4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - 
PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ 
- } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + const dim_t dfac = PASTECH2(bb0, _, chr); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ + if ( bli_is_1e_packed( schema ) ) \ { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ + /* cdim and mnr are in units of complex values */ \ + const dim_t mnr = PASTECH2(mnr0, _, chr) == -1 ? 
-1 : PASTECH2(mnr0, _, chr) / 2; \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ \ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ \ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ \ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_6xk_1er, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ + if ( cdim == mnr && mnr != -1 ) \ { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - 
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - 
PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ } \ } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ + else \ { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* 
restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 
7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), 
*(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ \ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ + PASTEMAC(chr,set0s_edge) \ ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ + 2*cdim*dfac, 2*cdim_max*dfac, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ ); \ } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_8xk_1er, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* 
restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ + else /* ( bli_is_1r_packed( schema ) ) */ \ { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( 
*(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + const dim_t mnr = PASTECH2(mnr0, _, chr); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ \ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ \ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - 
PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ + if ( cdim == mnr && mnr != -1 ) \ { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( 
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( 
*(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ } \ } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ + else \ { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ 
} \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_10xk_1er, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), 
*(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( 
*kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri 
+11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - 
alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, 
*kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_12xk_1er, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, 
\ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - 
PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 
7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - 
PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i 
+11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( 
dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), 
*(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero 
= PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_14xk_1er, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( 
*(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir 
+11) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,scal2j1es)( 
*kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* 
kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); 
\ - PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - 
else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ \ - PASTEMAC(ch,set1ms_mxn) \ + PASTEMAC(chr,set0s_edge) \ ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ + cdim*dfac, cdim_max*dfac, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ ); \ } \ } -INSERT_GENTFUNCCO_BASIC3( packm_16xk_1er, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_nrxk_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_packm_cxk_bb_ref.c b/ref_kernels/1m/bli_packm_cxk_bb_ref.c deleted file mode 100644 index e7498a735..000000000 --- a/ref_kernels/1m/bli_packm_cxk_bb_ref.c +++ /dev/null @@ -1,656 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -// -- 6xk, duplication factor 2 ------------------------------------------------ - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - const dim_t dfac = 2; \ -\ - /* Handle the packing of B (column panel schemas) separately from packing - of A (row panel schemas). */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ - 
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( 
*kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2bbs_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, dfac, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*dfac; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - 
} \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2s_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = 
n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - -// -- 6xk, duplication factor 4 ------------------------------------------------ - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - const dim_t dfac = 4; \ -\ - /* Handle the packing of B (column panel schemas) separately from packing - of A (row panel schemas). 
*/ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), 
*(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( 
*kappa_cast, *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 
2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2bbs_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, dfac, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*dfac; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( 
*kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 
3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2s_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk_bb4, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index c98f1b250..eefdb464b 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -34,469 +34,24 @@ #include "blis.h" -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 4; \ - dim_t n_left = n % 4; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - 
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ -\ - alpha1 += 4*lda; \ - pi1 += 4*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ +#define PACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \ \ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( 
cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 4; \ - dim_t n_left = n % 4; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 
1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 2*lda), *(pi1 + 2 + 2*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 3*lda), *(pi1 + 2 + 3*ldp) ); \ -\ - alpha1 += 4*lda; \ - pi1 += 4*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - 
cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ +do \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 2; \ - dim_t n_left = n % 2; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 
0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \ -\ - alpha1 += 2*lda; \ - pi1 += 2*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ + for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ + pragma \ + for ( dim_t mn = 0; mn < cdim; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \ \ - 
/* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ + alpha1 += lda; \ + pi1 += ldp; \ } \ -} - -INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - +} while(0) #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -511,1212 +66,42 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 
+ 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ + const dim_t mnr = PASTECH2(mnr0, _, ch); \ + const num_t dt = PASTEMAC(ch,type); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ + const dim_t dfac = PASTECH2(bb0, _, ch); \ \ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ 
- } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ + ctype kappa_cast = *( ctype* )kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ - dim_t n_iter = n / 2; \ - dim_t n_left = n % 2; \ -\ - if ( cdim == mnr ) \ + if ( cdim == mnr && mnr != -1 ) \ { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 0*lda), 
*(pi1 + 3 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca + 0*lda), *(pi1 + 4 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 0*lda), *(pi1 + 5 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 0*lda), *(pi1 + 6 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 0*lda), *(pi1 + 7 + 0*ldp) ); \ -\ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca + 1*lda), *(pi1 + 4 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 1*lda), *(pi1 + 5 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 1*lda), *(pi1 + 6 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 1*lda), *(pi1 + 7 + 1*ldp) ); \ -\ - alpha1 += 2*lda; \ - pi1 += 2*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j 
)*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \ + else PACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ } \ -} - -INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( 
*(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( 
cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - 
PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_12xk, 12, 
BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - 
PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( 
*kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ 
- cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - 
PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) 
); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ 
- pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - 
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,copys)( *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,copys)( *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,copys)( *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,copys)( *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,copys)( *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,copys)( *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,copys)( *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,copys)( *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - 
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - 
m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ + PASTEMAC(ch,set0s_edge) \ + ( \ + cdim*dfac, cdim_max*dfac, \ + n, n_max, \ + p, ldp \ + ); \ } -INSERT_GENTFUNC_BASIC3( packm_24xk, 24, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c index 00dc02eb4..73d98e268 100644 --- a/ref_kernels/1m/bli_unpackm_cxk_ref.c +++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c @@ -34,816 +34,64 @@ #include "blis.h" -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +#define UNPACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \ \ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ +do \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + for ( dim_t k = n; k != 0; --k ) \ { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ + pragma \ + for ( dim_t mn = 0; mn < cdim; mn++ ) \ + PASTEMAC(ch,op)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \ \ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ + alpha1 += lda; \ + pi1 += ldp; \ } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ 
- PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_2xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - +} while(0) #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ - conj_t conjp, \ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + ctype* restrict kappa, \ + ctype* restrict p, inc_t ldp, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + cntx_t* restrict cntx \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ + const dim_t mnr = PASTECH2(mnr0, _, ch); \ + /* It's not clear if unpack needs to care about BB storage... 
*/ \ + const dim_t dfac = PASTECH2(bb0, _, ch); \ \ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_4xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } 
\ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_6xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t 
lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( cdim == mnr && mnr != -1 ) \ { \ - if ( bli_is_conj( conjp ) ) \ + if ( inca == 1 ) \ { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \ + else UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ } \ else \ { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \ + else UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ } \ } \ - else \ + else /* if ( cdim < mnr ) */ \ { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - 
PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_8xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) 
); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - 
PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_10xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( 
*(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 
5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_12xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict 
kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) 
); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - 
PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_14xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - 
PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - 
PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), 
*(alpha1 + 13*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \ + else UNPACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ } \ } -INSERT_GENTFUNC_BASIC2( unpackm_16xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( unpackm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( unpackm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/3/bb/bli_gemmbb_ref.c b/ref_kernels/3/bb/bli_gemmbb_ref.c deleted file mode 100644 index 4c75c064c..000000000 --- a/ref_kernels/3/bb/bli_gemmbb_ref.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. */ \ - const inc_t cs_b = packnr / nr; \ -\ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - dim_t l, j, i; \ -\ - ctype ai; \ - ctype bj; \ -\ -\ - /* Initialize the accumulator elements in ab to zero. 
*/ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,set0s)( *(ab + i) ); \ - } \ -\ - /* Perform a series of k rank-1 updates into ab. */ \ - for ( l = 0; l < k; ++l ) \ - { \ - ctype* restrict abij = ab; \ -\ - /* In an optimized implementation, these two loops over MR and NR - are typically fully unrolled. */ \ - for ( j = 0; j < n; ++j ) \ - { \ - bj = *(b + j*cs_b); \ -\ - for ( i = 0; i < m; ++i ) \ - { \ - ai = *(a + i); \ -\ - PASTEMAC(ch,dots)( ai, bj, *abij ); \ -\ - abij += rs_ab; \ - } \ - } \ -\ - a += cs_a; \ - b += rs_b; \ - } \ -\ - /* Scale the result in ab by alpha. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ - } \ -\ - /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, - scale by beta and then add the scaled redult in ab. */ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,copys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - c, rs_c, cs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,xpbys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - beta, \ - c, rs_c, cs_c ); \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - diff --git a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c deleted file mode 100644 index dd4e1f153..000000000 --- a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. 
- -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. */ \ - const inc_t cs_b = packnr / nr; \ -/* -printf( "bli_gemmtrsmbb_ref(): cs_b = %d\n", (int)cs_b ); \ -printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ -*/ \ -\ - ctype* minus_one = PASTEMAC(ch,m1); \ -\ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ - PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ -\ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ - (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \ -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* lower: b11 = alpha * b11 - a10 * b01; */ \ - /* upper: b11 = alpha * b11 - a12 * b21; */ \ - gemm_ukr \ - ( \ - mr, \ - nr, \ - k, \ - minus_one, \ - a1x, \ - bx1, \ - alpha, \ - b11, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - trsm_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ 
-*/ \ -\ - /* Broadcast the elements of the updated b11 submatrix to their - duplicated neighbors. */ \ - PASTEMAC(ch,bcastbbs_mxn) \ - ( \ - mr, \ - nr, \ - b11, rs_b, cs_b \ - ); \ -\ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ - ( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \ - ( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR ) -INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR ) - diff --git a/ref_kernels/3/bb/bli_trsmbb_ref.c b/ref_kernels/3/bb/bli_trsmbb_ref.c deleted file mode 100644 index e3f5500cc..000000000 --- a/ref_kernels/3/bb/bli_trsmbb_ref.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \ - ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype beta11c = *beta11; \ - ctype rho11; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype* restrict alpha10 = a10t + (l )*cs_a; \ - ctype* restrict beta01 = b01 + (l )*rs_b; \ -\ - PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \ - } \ - PASTEMAC(ch,subs)( rho11, beta11c ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: When preinversion is enabled, the INVERSE of alpha11 - (1.0/alpha11) is stored during packing instead alpha11 so we - can multiply rather than divide. When preinversion is disabled, - alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,scals)( *alpha11, beta11c ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ -\ - /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ - } \ - } \ -} - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION -INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) -#else -INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) -#endif - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - iter - 1; \ - n_behind = iter; \ -\ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype beta11c = *beta11; \ - ctype rho11; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype* restrict alpha12 = a12t + (l )*cs_a; \ - ctype* restrict beta21 = b21 + (l )*rs_b; \ -\ - PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \ - } \ - PASTEMAC(ch,subs)( rho11, beta11c ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: When preinversion is enabled, the INVERSE of alpha11 - (1.0/alpha11) is stored during packing instead alpha11 so we - can multiply rather than divide. When preinversion is disabled, - alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ -\ - /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ - } \ - } \ -} - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION -INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) -#else -INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) -#endif - diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 51ff9df4b..f284acb98 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -34,13 +34,114 @@ #include "blis.h" -#if 1 +// Completely generic gemm ukr implementation which checks MR/NR at +// runtime. Very slow, but has to be used in certain cases. + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +\ +static void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ +\ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = m; \ +\ + dim_t l, j, i; \ +\ + ctype ai; \ + ctype bj; \ +\ +\ + /* Initialize the accumulator elements in ab to zero. */ \ + for ( i = 0; i < m * n; ++i ) \ + { \ + PASTEMAC(ch,set0s)( *(ab + i) ); \ + } \ +\ + /* Perform a series of k rank-1 updates into ab. 
*/ \ + for ( l = 0; l < k; ++l ) \ + { \ + ctype* restrict abij = ab; \ +\ + /* In an optimized implementation, these two loops over MR and NR + are typically fully unrolled. */ \ + for ( j = 0; j < n; ++j ) \ + { \ + bj = *(b + j*cs_b); \ +\ + for ( i = 0; i < m; ++i ) \ + { \ + ai = *(a + i*rs_a); \ +\ + PASTEMAC(ch,dots)( ai, bj, *abij ); \ +\ + abij += rs_ab; \ + } \ + } \ +\ + a += cs_a; \ + b += rs_b; \ + } \ +\ + /* Scale the result in ab by alpha. */ \ + for ( i = 0; i < m * n; ++i ) \ + { \ + PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ + } \ +\ + /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, + scale by beta and then add the scaled result in ab. */ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,copys_mxn)( m, \ + n, \ + ab, rs_ab, cs_ab, \ + c, rs_c, cs_c ); \ + } \ + else \ + { \ + PASTEMAC(ch,xpbys_mxn)( m, \ + n, \ + ab, rs_ab, cs_ab, \ + beta, \ + c, rs_c, cs_c ); \ + } \ +} + +INSERT_GENTFUNC_BASIC2( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // An implementation that attempts to facilitate emission of vectorized // instructions via constant loop bounds + #pragma omp simd directives. +// If compile-time MR/NR are not available (indicated by BLIS_[MN]R_x = -1), +// then the non-unrolled version (above) is used. 
#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \ +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -56,14 +157,38 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = nr; \ - const inc_t cs_ab = 1; \ \ - const inc_t cs_a = mr; \ - const inc_t rs_b = nr; \ + const dim_t mr = PASTECH(BLIS_MR_,ch); \ + const dim_t nr = PASTECH(BLIS_NR_,ch); \ +\ + if ( mr == -1 || nr == -1 ) \ + { \ + PASTEMAC3(ch,gemm_gen,arch,suf) \ + ( \ + m, \ + n, \ + k, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + data, \ + cntx \ + ); \ + return; \ + } \ +\ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = nr; \ + const inc_t cs_ab = 1; \ +\ + const inc_t rs_a = PASTECH(BLIS_BBM_,ch); \ + const inc_t cs_a = PASTECH(BLIS_PACKMR_,ch); \ + const inc_t rs_b = PASTECH(BLIS_PACKNR_,ch); \ + const inc_t cs_b = PASTECH(BLIS_BBN_,ch); \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ @@ -83,8 +208,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ PASTEMAC(ch,dots) \ ( \ - a[ i ], \ - b[ j ], \ + a[ i*rs_a ], \ + b[ j*cs_b ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ @@ -157,115 +282,6 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -//INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) -GENTFUNC( float, s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 ) -GENTFUNC( double, d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) -GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) -GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) - -#else - -// An implementation that uses variable loop bounds (queried from the context) -// and makes no use of #pragma omp simd. 
- -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - dim_t l, j, i; \ -\ - ctype ai; \ - ctype bj; \ -\ -\ - /* Initialize the accumulator elements in ab to zero. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,set0s)( *(ab + i) ); \ - } \ -\ - /* Perform a series of k rank-1 updates into ab. */ \ - for ( l = 0; l < k; ++l ) \ - { \ - ctype* restrict abij = ab; \ -\ - /* In an optimized implementation, these two loops over MR and NR - are typically fully unrolled. */ \ - for ( j = 0; j < n; ++j ) \ - { \ - bj = *(b + j); \ -\ - for ( i = 0; i < m; ++i ) \ - { \ - ai = *(a + i); \ -\ - PASTEMAC(ch,dots)( ai, bj, *abij ); \ -\ - abij += rs_ab; \ - } \ - } \ -\ - a += cs_a; \ - b += rs_b; \ - } \ -\ - /* Scale the result in ab by alpha. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ - } \ -\ - /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, - scale by beta and then add the scaled redult in ab. 
*/ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,copys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - c, rs_c, cs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,xpbys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - beta, \ - c, rs_c, cs_c ); \ - } \ -} - INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) -#endif diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 30fc3fcd6..046aa5617 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -34,6 +34,9 @@ #include "blis.h" +// An implementation that indexes through B with the assumption that all +// elements were broadcast (duplicated) by a factor of NP/NR. + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ \ @@ -60,21 +63,38 @@ void PASTEMAC3(ch,opname,arch,suf) \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ +/* +printf( "bli_gemmtrsm_ref(): cs_b = %d\n", (int)cs_b ); \ +printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ +*/ \ \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ + trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ + (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR - instead? 
*/ \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + /* to FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR + instead? + + to DAM: Given that this reference kernel is implemented in terms of gemm, + I think that is the preference we want to query. There might be other + circumstances where we would want the gemmtrsm_? operations to have + and exercise their own IO preferences -- I'd have to think about it -- + but this doesn't seem to be one of them. */ \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : nr ); \ const inc_t cs_ct = ( col_pref ? mr : 1 ); \ \ @@ -106,6 +126,19 @@ void PASTEMAC3(ch,opname,arch,suf) \ data, \ cntx \ ); \ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ +\ + /* Broadcast the elements of the updated b11 submatrix to their + duplicated neighbors. */ \ + PASTEMAC(ch,bcastbbs_mxn) \ + ( \ + m, \ + n, \ + b11, rs_b, cs_b \ + ); \ \ /* b11 = inv(a11) * b11; c11 = b11; */ \ @@ -117,6 +150,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ data, \ cntx \ ); \ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ \ if ( use_ct ) \ { \ diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c index 786f1129d..8234a84cc 100644 --- a/ref_kernels/3/bli_trsm_ref.c +++ b/ref_kernels/3/bli_trsm_ref.c @@ -34,17 +34,8 @@ #include "blis.h" -#if 0 - -// An implementation that attempts to facilitate emission of vectorized -// instructions via constant loop bounds + #pragma omp simd directives. - -// (Deleted. See 'old' directory.) - -#else - -// An implementation that uses variable loop bounds (queried from the context) -// and makes no use of #pragma omp simd. 
+// An implementation that indexes through B with the assumption that all +// elements were broadcast (duplicated) by a factor of NP/NR. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ @@ -69,11 +60,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ @@ -114,13 +105,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ (1.0/alpha11) is stored during packing instead alpha11 so we can multiply rather than divide. When preinversion is disabled, alpha11 is stored and division happens below explicitly. */ \ PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } @@ -155,19 +147,16 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ + for ( dim_t iter = 0; iter < m; ++iter ) \ { \ - i = m - iter - 1; \ - n_behind = iter; \ + dim_t i = m - iter - 1; \ + dim_t n_behind = iter; \ \ ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ @@ -176,7 +165,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ + for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ @@ -186,7 +175,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* beta11 = beta11 - a12t * b21; */ \ PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ + for ( dim_t l = 0; l < n_behind; ++l ) \ { \ ctype* restrict alpha12 = a12t + (l )*cs_a; \ ctype* restrict beta21 = b21 + (l )*rs_b; \ @@ -206,7 +195,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } @@ -217,4 +207,3 @@ INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) #endif -#endif diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 33e74ecaa..69c546cd4 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -107,60 +107,30 @@ // -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------ -#undef packm_2xk_ker_name -#define packm_2xk_ker_name GENARNAME(packm_2xk) -#undef packm_3xk_ker_name -#define packm_3xk_ker_name GENARNAME(packm_3xk) -#undef packm_4xk_ker_name -#define packm_4xk_ker_name GENARNAME(packm_4xk) -#undef packm_6xk_ker_name -#define packm_6xk_ker_name GENARNAME(packm_6xk) -#undef packm_8xk_ker_name -#define packm_8xk_ker_name GENARNAME(packm_8xk) -#undef packm_10xk_ker_name -#define packm_10xk_ker_name GENARNAME(packm_10xk) -#undef packm_12xk_ker_name -#define packm_12xk_ker_name GENARNAME(packm_12xk) -#undef packm_14xk_ker_name -#define packm_14xk_ker_name GENARNAME(packm_14xk) -#undef packm_16xk_ker_name -#define packm_16xk_ker_name GENARNAME(packm_16xk) -#undef packm_24xk_ker_name -#define packm_24xk_ker_name GENARNAME(packm_24xk) - -#undef unpackm_2xk_ker_name -#define unpackm_2xk_ker_name GENARNAME(unpackm_2xk) -#undef unpackm_4xk_ker_name -#define unpackm_4xk_ker_name GENARNAME(unpackm_4xk) -#undef unpackm_6xk_ker_name -#define unpackm_6xk_ker_name GENARNAME(unpackm_6xk) -#undef unpackm_8xk_ker_name -#define unpackm_8xk_ker_name GENARNAME(unpackm_8xk) -#undef unpackm_10xk_ker_name -#define unpackm_10xk_ker_name GENARNAME(unpackm_10xk) -#undef unpackm_12xk_ker_name -#define unpackm_12xk_ker_name GENARNAME(unpackm_12xk) -#undef unpackm_14xk_ker_name -#define unpackm_14xk_ker_name GENARNAME(unpackm_14xk) -#undef unpackm_16xk_ker_name 
-#define unpackm_16xk_ker_name GENARNAME(unpackm_16xk) - -#undef packm_2xk_1er_ker_name -#define packm_2xk_1er_ker_name GENARNAME(packm_2xk_1er) -#undef packm_4xk_1er_ker_name -#define packm_4xk_1er_ker_name GENARNAME(packm_4xk_1er) -#undef packm_6xk_1er_ker_name -#define packm_6xk_1er_ker_name GENARNAME(packm_6xk_1er) -#undef packm_8xk_1er_ker_name -#define packm_8xk_1er_ker_name GENARNAME(packm_8xk_1er) -#undef packm_10xk_1er_ker_name -#define packm_10xk_1er_ker_name GENARNAME(packm_10xk_1er) -#undef packm_12xk_1er_ker_name -#define packm_12xk_1er_ker_name GENARNAME(packm_12xk_1er) -#undef packm_14xk_1er_ker_name -#define packm_14xk_1er_ker_name GENARNAME(packm_14xk_1er) -#undef packm_16xk_1er_ker_name -#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1er) +#undef packm_mrxk_ker_name +#define packm_mrxk_ker_name GENARNAME(packm_mrxk) +#undef packm_nrxk_ker_name +#define packm_nrxk_ker_name GENARNAME(packm_nrxk) + +#undef packm_mrxk_1er_ker_name +#define packm_mrxk_1er_ker_name GENARNAME(packm_mrxk_1er) +#undef packm_nrxk_1er_ker_name +#define packm_nrxk_1er_ker_name GENARNAME(packm_nrxk_1er) + +#undef packm_mrxmr_diag_ker_name +#define packm_mrxmr_diag_ker_name GENARNAME(packm_mrxmr_diag) +#undef packm_nrxnr_diag_ker_name +#define packm_nrxnr_diag_ker_name GENARNAME(packm_nrxnr_diag) + +#undef packm_mrxmr_diag_1er_ker_name +#define packm_mrxmr_diag_1er_ker_name GENARNAME(packm_mrxmr_diag_1er) +#undef packm_nrxnr_diag_1er_ker_name +#define packm_nrxnr_diag_1er_ker_name GENARNAME(packm_nrxnr_diag_1er) + +#undef unpackm_mrxk_ker_name +#define unpackm_mrxk_ker_name GENARNAME(unpackm_mrxk) +#undef unpackm_nrxk_ker_name +#define unpackm_nrxk_ker_name GENARNAME(unpackm_nrxk) // Instantiate prototypes for above functions via the level-1m kernel API // template. 
@@ -259,11 +229,10 @@ void GENBARNAME(cntx_init) ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; func_t* funcs; mbool_t* mbools; dim_t i; - void** vfuncs; + void_fp* vfuncs; // -- Clear the context ---------------------------------------------------- @@ -273,59 +242,87 @@ void GENBARNAME(cntx_init) // -- Set blocksizes ------------------------------------------------------- + // NOTE: The macro values for register blocksizes and packm broadcast factors are + // used here as defined in the bli_kernel_defs_.h or generic values from + // bli_kernel_macro_defs.h otherwise. Configurations should also initialize the + // blocksizes in the context explicitly, but using the correct values here helps + // to prevent accidents. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); + bli_blksz_init ( &blkszs[ BLIS_MR ], BLIS_MR_s, BLIS_MR_d, BLIS_MR_c, BLIS_MR_z, + BLIS_PACKMR_s, BLIS_PACKMR_d, BLIS_PACKMR_c, BLIS_PACKMR_z ); + bli_blksz_init ( &blkszs[ BLIS_NR ], BLIS_NR_s, BLIS_NR_d, BLIS_NR_c, BLIS_NR_z, + BLIS_PACKNR_s, BLIS_PACKNR_d, BLIS_PACKNR_c, BLIS_PACKNR_z ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); + bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], BLIS_BBM_s, BLIS_BBM_d, BLIS_BBM_c, BLIS_BBM_z ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], BLIS_BBN_s, BLIS_BBN_d, BLIS_BBN_c, BLIS_BBN_z ); + + // -- Set level-3 small/unpacked thresholds -------------------------------- + + // NOTE: The default thresholds are set to zero so that the sup framework + // does not activate by 
default. Note that the semantic meaning of the + // thresholds is that the sup code path is executed if a dimension is + // strictly less than its corresponding threshold. So actually, the + // thresholds specify the minimum dimension size that will still dispatch + // the non-sup/large code path. This "strictly less than" behavior was + // chosen over "less than or equal to" so that threshold values of 0 would + // effectively disable sup (even for matrix dimensions of 0). // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 4, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); - bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); - bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 0, 0, 0, 0 ); // Initialize the context with the default blocksize objects and their // multiples. 
bli_cntx_set_blkszs ( - BLIS_NAT, 11, - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR, - BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2, - BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2, - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, - cntx + cntx, + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR, + BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2, + BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2, + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, + BLIS_VA_END ); // -- Set level-3 virtual micro-kernels ------------------------------------ - funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + funcs = bli_cntx_ukrs_buf( cntx ); // NOTE: We set the virtual micro-kernel slots to contain the addresses // of the native micro-kernels. In general, the ukernels in the virtual // ukernel slots are always called, and if the function called happens to // be a virtual micro-kernel, it will then know to find its native ukernel // (i.e., in the native ukernel slots). 
- gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); // -- Set level-3 native micro-kernels and preferences --------------------- - funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbools = bli_cntx_ukr_prefs_buf( cntx ); gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); @@ -333,108 +330,47 @@ void GENBARNAME(cntx_init) gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); - // s d c z - bli_mbool_init( &mbools[ BLIS_GEMM_UKR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE ); - - - // -- Set level-3 small/unpacked thresholds -------------------------------- - - // NOTE: The default thresholds are set to zero so that the sup framework - // does not activate by default. Note that the semantic meaning of the - // thresholds is that the sup code path is executed if a dimension is - // strictly less than its corresponding threshold. 
So actually, the - // thresholds specify the minimum dimension size that will still dispatch - // the non-sup/large code path. This "strictly less than" behavior was - // chosen over "less than or equal to" so that threshold values of 0 would - // effectively disable sup (even for matrix dimensions of 0). - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 0, 0, 0, 0 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 0, 0, 0, 0 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 0, 0, 0, 0 ); - - // Initialize the context with the default thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - - // -- Set level-3 small/unpacked handlers ---------------------------------- - - vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); - - // Initialize all of the function pointers to NULL; - for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; - - // The level-3 sup handlers are oapi-based, so we only set one slot per - // operation. - - // Set the gemm slot to the default gemm sup handler. 
- vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref; - vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref; + // s d c z + bli_mbool_init( &mbools[ BLIS_GEMM_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); // -- Set level-3 small/unpacked micro-kernels and preferences ------------- - funcs = bli_cntx_l3_sup_kers_buf( cntx ); - mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - -#if 0 - // Adhere to the small/unpacked ukernel mappings: - // - rv -> rrr, rcr - // - rg -> rrc, rcc - // - cv -> ccr, ccc - // - cg -> crr, crc - gen_sup_func_init( &funcs[ BLIS_RRR ], - &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_RRC ], - &funcs[ BLIS_RCC ], gemmsup_rg_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_CCR ], - &funcs[ BLIS_CCC ], gemmsup_cv_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_CRR ], - &funcs[ BLIS_CRC ], gemmsup_cg_ukr_name ); -#endif - gen_func_init( &funcs[ BLIS_RRR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RRC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RCC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CRR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CRC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CCR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CCC ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RRR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RRC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RCR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RCC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ 
BLIS_GEMMSUP_CRR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CRC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CCR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CCC_UKR ], gemmsup_rv_ukr_name ); // Register the general-stride/generic ukernel to the "catch-all" slot // associated with the BLIS_XXX enum value. This slot will be queried if // *any* operand is stored with general stride. - gen_func_init( &funcs[ BLIS_XXX ], gemmsup_gx_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_XXX_UKR ], gemmsup_gx_ukr_name ); // Set the l3 sup ukernel storage preferences. - // s d c z - bli_mbool_init( &mbools[ BLIS_RRR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RRC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RCR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RCC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CRR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CRC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CCR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CCC ], TRUE, TRUE, TRUE, TRUE ); + // s d c z + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_XXX ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_XXX_UKR_ROW_PREF ], TRUE, 
TRUE, TRUE, TRUE ); // -- Set level-1f kernels ------------------------------------------------- - funcs = bli_cntx_l1f_kers_buf( cntx ); - gen_func_init( &funcs[ BLIS_AXPY2V_KER ], axpy2v_ker_name ); gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ], dotaxpyv_ker_name ); gen_func_init( &funcs[ BLIS_AXPYF_KER ], axpyf_ker_name ); @@ -444,8 +380,6 @@ void GENBARNAME(cntx_init) // -- Set level-1v kernels ------------------------------------------------- - funcs = bli_cntx_l1v_kers_buf( cntx ); - gen_func_init( &funcs[ BLIS_ADDV_KER ], addv_ker_name ); gen_func_init( &funcs[ BLIS_AMAXV_KER ], amaxv_ker_name ); gen_func_init( &funcs[ BLIS_AXPBYV_KER ], axpbyv_ker_name ); @@ -464,41 +398,35 @@ void GENBARNAME(cntx_init) // -- Set level-1m (packm/unpackm) kernels --------------------------------- - funcs = bli_cntx_packm_kers_buf( cntx ); + gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); - // Initialize all packm kernel func_t entries to NULL. 
- for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ], packm_3xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name ); - - funcs = bli_cntx_unpackm_kers_buf( cntx ); - - // Initialize all packm kernel func_t entries to NULL. 
- for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } + gen_func_init( &funcs[ BLIS_PACKM_MRXMR_DIAG_KER ], packm_mrxmr_diag_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXNR_DIAG_KER ], packm_nrxnr_diag_ker_name ); + + gen_func_init_co( &funcs[ BLIS_PACKM_MRXMR_DIAG_1ER_KER ], packm_mrxmr_diag_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXNR_DIAG_1ER_KER ], packm_nrxnr_diag_1er_ker_name ); + + gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ], unpackm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); + + + // -- Set level-3 small/unpacked handlers ---------------------------------- + + vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); - gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ], unpackm_2xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ], unpackm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ], unpackm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ], unpackm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name ); + // Initialize all of the function pointers to NULL; + for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; + + // The level-3 sup handlers are oapi-based, so we only set one slot per + // operation. + + // Set the gemm slot to the default gemm sup handler. 
+ vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref; + vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref; // -- Set miscellaneous fields --------------------------------------------- @@ -515,7 +443,6 @@ void GENBAINAME(cntx_init) ) { func_t* funcs; - dim_t i; // This function is designed to modify a copy of an existing native // context to enable computation via an induced method for complex @@ -525,23 +452,23 @@ void GENBAINAME(cntx_init) // -- Set induced method level-3 virtual micro-kernels --------------------- - funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + funcs = bli_cntx_ukrs_buf( cntx ); if ( method == BLIS_1M ) { - gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm1m_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm1m_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm1m_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm1m_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm1m_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm1m_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm1m_u_ukr_name ); } else // if ( method == BLIS_NAT ) { - gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name 
); + gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); } // For 1m, we employ an optimization which requires that we copy the native @@ -556,8 +483,8 @@ void GENBAINAME(cntx_init) // beta has a zero imaginary component and C is either row- or column-stored). if ( method == BLIS_1M ) { - func_t* gemm_nat_ukrs = bli_cntx_get_l3_nat_ukrs( BLIS_GEMM_UKR, cntx ); - func_t* gemm_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_nat_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); bli_func_copy_dt( BLIS_FLOAT, gemm_nat_ukrs, BLIS_FLOAT, gemm_vir_ukrs ); bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs ); @@ -566,39 +493,23 @@ void GENBAINAME(cntx_init) // -- Set induced method packm kernels ------------------------------------- - funcs = bli_cntx_packm_kers_buf( cntx ); - - // Initialize all packm kernel func_t entries to NULL. - for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } - if ( method == BLIS_1M ) { - gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_1er_ker_name ); } else // if ( method == BLIS_NAT ) { - gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_ker_name ); - 
gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ], packm_3xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); } + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); + + gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ], unpackm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); + // -- Set induced method cache and register blocksizes --------------------- @@ -628,50 +539,44 @@ void GENBAINAME(cntx_init_blkszs) cntx_t* cntx ) { - // We MUST set the induced method in the context prior to calling - // bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries - // the induced method. That function needs the induced method value in - // order to determine whether to evaluate the "prefers column storage" - // predicate using the storage preference of the kernel for dt, or - // the storage preference of the kernel for the real projection of - // dt. Failing to set the induced method here can lead to strange - // undefined behavior at runtime if the native complex kernel's - // storage preference happens to not equal that of the native real - // kernel. + // Set the induced method in the context. 
bli_cntx_set_method( method, cntx ); + num_t dt_r = bli_dt_proj_to_real( dt ); + // Initialize the blocksizes according to the micro-kernel preference as // well as the algorithm. - if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + //if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + if ( ! bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) { // This branch is used for algorithm 1m_c_bp. bli_cntx_set_ind_blkszs ( - method, dt, 6, + method, dt, cntx, BLIS_NC, 1.0, 1.0, BLIS_KC, 2.0, 2.0, // halve kc... BLIS_MC, 2.0, 2.0, // halve mc... BLIS_NR, 1.0, 1.0, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) BLIS_KR, 1.0, 1.0, - cntx + BLIS_VA_END ); } - else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + else // if ( bli_cntx_get_ukr_prefs_dt( dt, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) { // This branch is used for algorithm 1m_r_bp. bli_cntx_set_ind_blkszs ( - method, dt, 6, + method, dt, cntx, BLIS_NC, 2.0, 2.0, // halve nc... BLIS_KC, 2.0, 2.0, // halve kc... 
BLIS_MC, 1.0, 1.0, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) BLIS_MR, 1.0, 1.0, BLIS_KR, 1.0, 1.0, - cntx + BLIS_VA_END ); } } diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index fbd15d695..2f0808389 100644 --- a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -55,8 +55,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 08823f073..6cfb83cae 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -56,12 +56,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ \ PASTECH(ch,trsm_ukr_ft) \ ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ \ - const bool col_pref_r = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref_r = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -89,7 +89,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict bx1_r = ( ctype_r* )bx1; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ 
ctype_r* restrict zero_r = PASTEMAC(chr,0); \ ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ @@ -106,7 +106,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR instead? */ \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : nr ); \ const inc_t cs_ct = ( col_pref ? mr : 1 ); \ \ @@ -192,24 +192,25 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ if ( bli_is_1e_packed( schema_b ) ) \ { \ - const inc_t ld_b = rs_b; \ + const inc_t ld_b = rs_b; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ - ctype* restrict b11_ri = ( ctype* )b11; \ - ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \ -\ - dim_t i, j; \ + ctype_r* restrict b11_ri = ( ctype_r* )b11; \ + ctype_r* restrict b11_ir = ( ctype_r* )b11 + ld_b; \ \ /* b11 = alpha * b11 + bt; */ \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t d = 0; d < cs_b; ++d ) \ { \ - ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ - ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ - ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ - ctype* restrict beta11_ri = b11_ri + i*rs_b + j*cs_b; \ - ctype_r* restrict beta11_r = &PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r* restrict beta11_i = &PASTEMAC(ch,imag)( *beta11_ri ); \ - ctype* restrict beta11_ir = b11_ir + i*rs_b + j*cs_b; \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype_r* restrict beta11_ri_r = b11_ri + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \ + ctype_r* restrict beta11_ri_i = b11_ri + i*rs_b2 + j*cs_b2 + 1*cs_b + d; 
\ + ctype_r* restrict beta11_ir_r = b11_ir + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \ + ctype_r* restrict beta11_ir_i = b11_ir + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \ \ PASTEMAC3(ch,chr,ch,xpbyris) \ ( \ @@ -217,12 +218,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ *beta11t_i, \ alpha_r, \ alpha_i, /* alpha_i not referenced */ \ - *beta11_r, \ - *beta11_i \ + *beta11_ri_r, \ + *beta11_ri_i \ ); \ \ - PASTEMAC(ch,sets)( -*beta11_i, \ - *beta11_r, *beta11_ir ); \ + PASTEMAC(ch,copyris)( -*beta11_ri_i, *beta11_ri_r, \ + *beta11_ir_r, *beta11_ir_i ); \ } \ } \ else /* if ( bli_is_1r_packed( schema_b ) ) */ \ @@ -233,18 +234,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ ctype_r* restrict b11_r = ( ctype_r* )b11; \ ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \ -\ - dim_t i, j; \ \ /* b11 = alpha * b11 + bt; */ \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t d = 0; d < cs_b; ++d ) \ { \ ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ - ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2; \ - ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2; \ + ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2 + d; \ + ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2 + d; \ \ PASTEMAC3(ch,chr,ch,xpbyris) \ ( \ diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c index 68717f7a6..5eda20f20 100644 --- a/ref_kernels/ind/bli_trsm1m_ref.c +++ b/ref_kernels/ind/bli_trsm1m_ref.c @@ -48,6 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -58,11 +59,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = 
mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ @@ -77,12 +78,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const inc_t rs_a2 = 1 * rs_a; \ const inc_t cs_a2 = 2 * cs_a; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ \ - ctype* restrict b_ri = ( ctype* )b; \ - ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ + ctype_r* restrict b_ri = ( ctype_r* )b; \ + ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -93,20 +96,22 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ - ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_ri = b_ri + (0 )*rs_b2 + (0 )*cs_b2; \ \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ - ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r* restrict 
beta11_ri_r = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ri_i = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict beta11_ir_r = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ir_i = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict b01_ri = B0_ri + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_ri_r; \ + ctype_r beta11c_i = *beta11_ri_i; \ ctype_r rho11_r; \ ctype_r rho11_i; \ \ @@ -117,9 +122,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \ ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \ - ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \ - ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \ - ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \ + ctype_r* restrict beta01_r = b01_ri + (l )*rs_b2 + 0*cs_b; \ + ctype_r* restrict beta01_i = b01_ri + (l )*rs_b2 + 1*cs_b; \ \ PASTEMAC(ch,axpyris)( *alpha10_r, \ *alpha10_i, \ @@ -147,8 +151,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ - PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + } \ } \ } \ } \ @@ -229,10 +236,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *beta11_r, \ - *beta11_i ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ @@ -258,6 +266,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -268,11 +277,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ @@ -287,12 +296,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const inc_t rs_a2 = 1 * rs_a; \ const inc_t cs_a2 = 2 * cs_a; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ \ - ctype* restrict b_ri = ( ctype* )b; \ - ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ + ctype_r* restrict b_ri = ( ctype_r* )b; \ + ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -303,20 +314,22 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \ ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \ - ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; 
\ + ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_ri = b_ri + (i+1)*rs_b2 + (0 )*cs_b2; \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ - ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r* restrict beta11_ri_r = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ri_i = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict beta11_ir_r = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ir_i = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict b21_ri = B2_ri + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_ri_r; \ + ctype_r beta11c_i = *beta11_ri_i; \ ctype_r rho11_r; \ ctype_r rho11_i; \ \ @@ -325,11 +338,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ rho11_i ); \ for ( l = 0; l < n_behind; ++l ) \ { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ - ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \ - ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \ - ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ + ctype_r* restrict beta21_r = b21_ri + (l )*rs_b2 + 0*cs_b; \ + ctype_r* restrict beta21_i = b21_ri + (l )*rs_b2 + 1*cs_b; \ \ PASTEMAC(ch,axpyris)( *alpha12_r, \ *alpha12_i, \ @@ -357,8 +369,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ - PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + } \ } \ } \ } \ @@ -439,10 +454,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *beta11_r, \ - *beta11_i ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ diff --git a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c index 957cd5794..8caccf923 100644 --- a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c +++ b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c @@ -157,7 +157,7 @@ void PASTECH2(bls_,ch,varname) \ function pointer type. */ \ /* PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ */ \ \ /* Temporary C buffer for edge cases. Note that the strides of this @@ -168,7 +168,7 @@ void PASTECH2(bls_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ @@ -524,7 +524,7 @@ void PASTECH2(bls_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -533,7 +533,7 @@ void PASTECH2(bls_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index f2f8b7e25..ec5d8d5b1 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -134,7 +134,7 @@ void bls_gemm_ex // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 62dc462d5..1e3e5ea03 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -156,7 +156,7 @@ void PASTECH2(bls_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Compute partitioning step values for each matrix of each loop. 
*/ \ const inc_t jcstep_c = cs_c; \ diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c index ca11c207c..2ed178c65 100644 --- a/sandbox/gemmlike/bls_packm_cxk.c +++ b/sandbox/gemmlike/bls_packm_cxk.c @@ -54,15 +54,16 @@ void PASTECH2(bls_,ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index b07da91cc..9568dfee7 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -305,7 +305,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); if ( bli_obj_is_complex( &b ) ) *perf *= 4.0; // Perform checks. - libblis_test_trsm_ukr_check( params, side, &ap, &c, &b, resid ); + libblis_test_trsm_ukr_check( params, side, &a, &c, &b, resid ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c, perf, resid ); @@ -418,9 +418,11 @@ void libblis_test_trsm_ukr_check bli_printm( "a11", a, "%5.2f", "" ); #endif +#if 0 // Restore the diagonal of a11 to its original, un-inverted state // (needed for trsv). 
bli_invertd( a ); +#endif if ( bli_is_left( side ) ) { From 9fea633748ed27ef3853bba7cd955690c61092b4 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 13 Apr 2022 15:59:06 -0500 Subject: [PATCH 051/230] Partial addition of 'const' to all interfaces above the (micro)kernels. (#625) Details: - Added 'const' qualifier to applicable function arguments wherever the pointed-to object is not internally modified. This change affects all interfaces that reside above the level of the (micro)kernels. - Typecast certain function return values to discard 'const' qualifier. - Removed 'restrict' from various arguments, including cntx_t*, auxinfo_t*, rntm_t*, thrinfo_t*, mem_t*, and others. - Removed parts of some APIs, such as bli_cntx_*(), due to limited use. - Merged some variable declarations with their corresponding initialization statements. - Whitespace changes. --- build/detect/config/config_detect.c | 4 +- .../kernels/1/bli_axpyv_template_noopt_var1.c | 2 +- .../kernels/1/bli_dotv_template_noopt_var1.c | 4 +- .../1f/bli_axpy2v_template_noopt_var1.c | 2 +- .../1f/bli_axpyf_template_noopt_var1.c | 2 +- .../1f/bli_dotaxpyv_template_noopt_var1.c | 2 +- .../1f/bli_dotxaxpyf_template_noopt_var1.c | 2 +- .../1f/bli_dotxf_template_noopt_var1.c | 4 +- .../kernels/3/bli_gemm_template_noopt_mxn.c | 4 +- .../3/bli_gemmtrsm_l_template_noopt_mxn.c | 4 +- .../3/bli_gemmtrsm_u_template_noopt_mxn.c | 4 +- .../kernels/3/bli_trsm_l_template_noopt_mxn.c | 4 +- .../kernels/3/bli_trsm_u_template_noopt_mxn.c | 4 +- frame/0/bli_l0_check.c | 50 +- frame/0/bli_l0_check.h | 50 +- frame/0/bli_l0_ft.h | 42 +- frame/0/bli_l0_oapi.c | 64 +- frame/0/bli_l0_oapi.h | 34 +- frame/0/bli_l0_tapi.c | 64 +- frame/0/bli_l0_tapi.h | 54 +- frame/0/copysc/bli_copysc.c | 32 +- frame/0/copysc/bli_copysc.h | 10 +- frame/1/bli_l1v_check.c | 94 +- frame/1/bli_l1v_check.h | 94 +- frame/1/bli_l1v_ft.h | 92 +- frame/1/bli_l1v_ker_prot.h | 28 +- frame/1/bli_l1v_oapi.c | 194 +-- frame/1/bli_l1v_oapi.h | 68 +- 
frame/1/bli_l1v_tapi.c | 158 +-- frame/1/bli_l1v_tapi.h | 92 +- frame/1d/bli_l1d_check.c | 38 +- frame/1d/bli_l1d_check.h | 38 +- frame/1d/bli_l1d_ft.h | 86 +- frame/1d/bli_l1d_oapi.c | 44 +- frame/1d/bli_l1d_oapi.h | 30 +- frame/1d/bli_l1d_tapi.c | 164 +-- frame/1d/bli_l1d_tapi.h | 86 +- frame/1f/bli_l1f_check.c | 56 +- frame/1f/bli_l1f_check.h | 56 +- frame/1f/bli_l1f_ft.h | 94 +- frame/1f/bli_l1f_ker_prot.h | 10 +- frame/1f/bli_l1f_oapi.c | 66 +- frame/1f/bli_l1f_oapi.h | 56 +- frame/1f/bli_l1f_tapi.c | 156 +- frame/1f/bli_l1f_tapi.h | 94 +- frame/1m/bli_l1m_check.c | 34 +- frame/1m/bli_l1m_check.h | 34 +- frame/1m/bli_l1m_ft.h | 86 +- frame/1m/bli_l1m_ft_ker.h | 6 +- frame/1m/bli_l1m_ker_prot.h | 6 +- frame/1m/bli_l1m_oapi.c | 40 +- frame/1m/bli_l1m_oapi.h | 20 +- frame/1m/bli_l1m_oft_var.h | 22 +- frame/1m/bli_l1m_tapi.c | 176 +-- frame/1m/bli_l1m_tapi.h | 86 +- frame/1m/packm/bli_packm_alloc.c | 18 +- frame/1m/packm/bli_packm_alloc.h | 18 +- frame/1m/packm/bli_packm_blk_var1.c | 16 +- frame/1m/packm/bli_packm_blk_var1.h | 16 +- frame/1m/packm/bli_packm_check.c | 12 +- frame/1m/packm/bli_packm_check.h | 12 +- frame/1m/packm/bli_packm_cntl.h | 30 +- frame/1m/packm/bli_packm_init.c | 12 +- frame/1m/packm/bli_packm_init.h | 12 +- frame/1m/packm/bli_packm_int.c | 12 +- frame/1m/packm/bli_packm_int.h | 12 +- frame/1m/packm/bli_packm_part.c | 32 +- frame/1m/packm/bli_packm_part.h | 38 +- frame/1m/packm/bli_packm_struc_cxk.c | 34 +- frame/1m/packm/bli_packm_struc_cxk.h | 36 +- frame/1m/unpackm/bli_unpackm_blk_var1.c | 84 +- frame/1m/unpackm/bli_unpackm_blk_var1.h | 10 +- frame/1m/unpackm/bli_unpackm_check.c | 6 +- frame/1m/unpackm/bli_unpackm_check.h | 6 +- frame/1m/unpackm/bli_unpackm_int.c | 22 +- frame/1m/unpackm/bli_unpackm_int.h | 10 +- frame/2/bli_l2_check.c | 96 +- frame/2/bli_l2_check.h | 48 +- frame/2/bli_l2_ft.h | 106 +- frame/2/bli_l2_oapi.c | 192 +-- frame/2/bli_l2_oapi.h | 30 +- frame/2/bli_l2_tapi.c | 174 +-- frame/2/bli_l2_tapi.h | 106 +- .../gemv/{ => 
other}/bli_gemv_var_oapi.c.prev | 0 frame/3/bli_l3_blocksize.c | 157 +-- frame/3/bli_l3_blocksize.h | 42 +- frame/3/bli_l3_check.c | 208 +-- frame/3/bli_l3_check.h | 124 +- frame/3/bli_l3_cntl.c | 18 +- frame/3/bli_l3_cntl.h | 18 +- frame/3/bli_l3_direct.c | 32 +- frame/3/bli_l3_direct.h | 14 +- frame/3/bli_l3_ft_ukr.h | 12 +- frame/3/bli_l3_ind_ukr.h | 12 +- frame/3/bli_l3_int.c | 18 +- frame/3/bli_l3_int.h | 18 +- frame/3/bli_l3_oapi.c | 38 +- frame/3/bli_l3_oapi.h | 38 +- frame/3/bli_l3_oapi_ex.c | 152 +- frame/3/bli_l3_oapi_ex.h | 54 +- frame/3/bli_l3_oft.h | 54 +- frame/3/bli_l3_oft_var.h | 14 +- frame/3/bli_l3_packab.c | 28 +- frame/3/bli_l3_packab.h | 28 +- frame/3/bli_l3_prune.c | 246 ++-- frame/3/bli_l3_prune.h | 57 +- frame/3/bli_l3_schema.c | 8 +- frame/3/bli_l3_schema.h | 8 +- frame/3/bli_l3_sup.c | 28 +- frame/3/bli_l3_sup.h | 28 +- frame/3/bli_l3_sup_ft_ker.h | 4 +- frame/3/bli_l3_sup_int.c | 32 +- frame/3/bli_l3_sup_int.h | 32 +- frame/3/bli_l3_sup_ker_prot.h | 4 +- frame/3/bli_l3_sup_oft.h | 14 +- frame/3/bli_l3_sup_packm_a.c | 96 +- frame/3/bli_l3_sup_packm_a.h | 88 +- frame/3/bli_l3_sup_packm_b.c | 96 +- frame/3/bli_l3_sup_packm_b.h | 88 +- frame/3/bli_l3_sup_packm_var.c | 116 +- frame/3/bli_l3_sup_packm_var.h | 42 +- frame/3/bli_l3_sup_ref.c | 28 +- frame/3/bli_l3_sup_ref.h | 28 +- frame/3/bli_l3_sup_var12.c | 166 +-- frame/3/bli_l3_sup_var1n2m.c | 432 +++--- frame/3/bli_l3_sup_vars.h | 92 +- frame/3/bli_l3_tapi.c | 176 +-- frame/3/bli_l3_tapi.h | 156 +- frame/3/bli_l3_tapi_ex.c | 294 ++-- frame/3/bli_l3_tapi_ex.h | 188 +-- frame/3/bli_l3_ukr_prot.h | 12 +- frame/3/bli_l3_ukr_tapi.c | 12 +- frame/3/gemm/bli_gemm_blk_var1.c | 35 +- frame/3/gemm/bli_gemm_blk_var2.c | 35 +- frame/3/gemm/bli_gemm_blk_var3.c | 38 +- frame/3/gemm/bli_gemm_front.c | 22 +- frame/3/gemm/bli_gemm_front.h | 30 +- frame/3/gemm/bli_gemm_ker_var2.c | 110 +- frame/3/gemm/bli_gemm_md.c | 282 ++-- frame/3/gemm/bli_gemm_md.h | 88 +- frame/3/gemm/bli_gemm_md_c2r_ref.c | 4 +- 
frame/3/gemm/bli_gemm_var.h | 14 +- frame/3/gemm/ind/bli_gemm_ind_opt.h | 20 +- frame/3/gemmt/bli_gemmt_front.c | 16 +- frame/3/gemmt/bli_gemmt_front.h | 16 +- frame/3/gemmt/bli_gemmt_l_ker_var2.c | 105 +- frame/3/gemmt/bli_gemmt_u_ker_var2.c | 105 +- frame/3/gemmt/bli_gemmt_var.h | 14 +- frame/3/gemmt/bli_gemmt_x_ker_var2.c | 14 +- frame/3/hemm/bli_hemm_front.c | 18 +- frame/3/hemm/bli_hemm_front.h | 18 +- frame/3/symm/bli_symm_front.c | 18 +- frame/3/symm/bli_symm_front.h | 18 +- frame/3/trmm/bli_trmm_front.c | 14 +- frame/3/trmm/bli_trmm_front.h | 14 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_var.h | 14 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 14 +- frame/3/trmm3/bli_trmm3_front.c | 18 +- frame/3/trmm3/bli_trmm3_front.h | 18 +- frame/3/trsm/bli_trsm_blk_var1.c | 44 +- frame/3/trsm/bli_trsm_blk_var2.c | 35 +- frame/3/trsm/bli_trsm_blk_var3.c | 41 +- frame/3/trsm/bli_trsm_front.c | 14 +- frame/3/trsm/bli_trsm_front.h | 14 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 93 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 93 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 93 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 93 +- frame/3/trsm/bli_trsm_var.h | 14 +- frame/3/trsm/bli_trsm_xx_ker_var2.c | 14 +- frame/base/bli_apool.c | 70 +- frame/base/bli_apool.h | 34 +- frame/base/bli_arch.c | 12 +- frame/base/bli_arch.h | 4 +- frame/base/bli_array.c | 43 +- frame/base/bli_array.h | 28 +- frame/base/bli_auxinfo.h | 36 +- frame/base/bli_blksz.c | 48 +- frame/base/bli_blksz.h | 60 +- frame/base/bli_check.c | 84 +- frame/base/bli_check.h | 86 +- frame/base/bli_cntl.c | 4 +- frame/base/bli_cntl.h | 24 +- frame/base/bli_cntx.c | 20 +- frame/base/bli_cntx.h | 142 +- frame/base/bli_const.c | 10 +- frame/base/bli_env.c | 4 +- frame/base/bli_error.c | 6 +- frame/base/bli_error.h | 4 +- frame/base/bli_func.c | 6 +- frame/base/bli_func.h | 18 +- 
frame/base/bli_getopt.c | 8 +- frame/base/bli_getopt.h | 10 +- frame/base/bli_gks.c | 66 +- frame/base/bli_gks.h | 32 +- frame/base/bli_ind.c | 12 +- frame/base/bli_ind.h | 28 +- frame/base/bli_info.c | 43 +- frame/base/bli_info.h | 36 +- frame/base/bli_mbool.h | 2 +- frame/base/bli_mem.h | 16 +- frame/base/bli_memsys.c | 2 +- frame/base/bli_obj.c | 49 +- frame/base/bli_obj.h | 14 +- frame/base/bli_obj_scalar.c | 45 +- frame/base/bli_obj_scalar.h | 26 +- frame/base/bli_part.c | 153 +- frame/base/bli_part.h | 58 +- frame/base/bli_pba.c | 119 +- frame/base/bli_pba.h | 38 +- frame/base/bli_pool.c | 84 +- frame/base/bli_pool.h | 92 +- frame/base/bli_query.c | 6 +- frame/base/bli_query.h | 6 +- frame/base/bli_rntm.c | 6 +- frame/base/bli_rntm.h | 36 +- frame/base/bli_sba.c | 22 +- frame/base/bli_sba.h | 18 +- frame/base/bli_setgetijm.c | 68 +- frame/base/bli_setgetijm.h | 40 +- frame/base/bli_setgetijv.c | 52 +- frame/base/bli_setgetijv.h | 32 +- frame/base/bli_setri.c | 16 +- frame/base/bli_setri.h | 16 +- frame/base/cast/bli_castm.c | 89 +- frame/base/cast/bli_castm.h | 18 +- frame/base/cast/bli_castnzm.c | 89 +- frame/base/cast/bli_castnzm.h | 18 +- frame/base/cast/bli_castv.c | 53 +- frame/base/cast/bli_castv.h | 16 +- frame/base/check/bli_obj_check.c | 47 +- frame/base/check/bli_obj_check.h | 51 +- frame/base/check/bli_part_check.c | 30 +- frame/base/check/bli_part_check.h | 34 +- frame/base/proj/bli_projm.c | 8 +- frame/base/proj/bli_projm.h | 8 +- frame/base/proj/bli_projv.c | 8 +- frame/base/proj/bli_projv.h | 8 +- frame/compat/extra/bla_gemm3m.c | 4 +- frame/include/bli_extern_defs.h | 10 +- frame/include/bli_oapi_ba.h | 4 +- frame/include/bli_oapi_ex.h | 2 +- frame/include/bli_obj_macro_defs.h | 272 ++-- frame/include/bli_tapi_ba.h | 4 +- frame/include/bli_tapi_ex.h | 2 +- frame/include/bli_type_defs.h | 44 +- frame/thread/bli_l3_decor.h | 38 +- frame/thread/bli_l3_decor_openmp.c | 51 +- frame/thread/bli_l3_decor_openmp.h | 2 +- 
frame/thread/bli_l3_decor_pthreads.c | 118 +- frame/thread/bli_l3_decor_single.c | 44 +- frame/thread/bli_l3_sup_decor.h | 34 +- frame/thread/bli_l3_sup_decor_openmp.c | 26 +- frame/thread/bli_l3_sup_decor_pthreads.c | 74 +- frame/thread/bli_l3_sup_decor_single.c | 27 +- frame/thread/bli_thread.c | 360 ++--- frame/thread/bli_thread.h | 104 +- frame/thread/bli_thrinfo.c | 10 +- frame/thread/bli_thrinfo.h | 28 +- frame/thread/bli_thrinfo_sup.c | 38 +- frame/thread/bli_thrinfo_sup.h | 22 +- frame/util/bli_util_check.c | 94 +- frame/util/bli_util_check.h | 80 +- frame/util/bli_util_ft.h | 96 +- frame/util/bli_util_oapi.c | 260 ++-- frame/util/bli_util_oapi.h | 75 +- frame/util/bli_util_tapi.c | 152 +- frame/util/bli_util_tapi.h | 112 +- frame/util/bli_util_unb_var1.c | 150 +- frame/util/bli_util_unb_var1.h | 34 +- .../armsve/1m/bli_dpackm_armsve256_int_8xk.c | 2 +- .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c | 2 +- .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c | 2 +- .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 4 +- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 4 +- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 4 +- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 4 +- kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c | 24 +- kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c | 8 +- kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c | 6 +- kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c | 6 +- .../armv8a/1m/bli_packm_armv8a_int_s12xk.c | 6 +- kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c | 6 +- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 16 +- kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c | 8 +- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c | 4 +- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c | 4 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c | 6 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c | 6 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c | 4 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c | 4 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c | 4 +- 
.../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c | 4 +- .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c | 4 +- .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c | 12 +- .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c | 4 +- .../d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c | 4 +- .../d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c | 6 +- kernels/bgq/1/bli_axpyv_bgq_int.c | 8 +- kernels/bgq/1/bli_dotv_bgq_int.c | 4 +- kernels/bgq/1f/bli_axpyf_bgq_int.c | 6 +- kernels/bgq/3/bli_gemm_bgq_int_8x8.c | 8 +- .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c | 16 +- .../haswell/1m/bli_packm_haswell_asm_c3xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_c8xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_d6xk.c | 26 +- .../haswell/1m/bli_packm_haswell_asm_d8xk.c | 26 +- .../haswell/1m/bli_packm_haswell_asm_s16xk.c | 28 +- .../haswell/1m/bli_packm_haswell_asm_s6xk.c | 28 +- .../haswell/1m/bli_packm_haswell_asm_z3xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_z4xk.c | 2 +- kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 16 +- kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 16 +- .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c | 8 +- .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c | 8 +- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c | 476 +++---- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c | 612 ++++---- .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c | 790 +++++------ .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c | 614 ++++---- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 828 +++++------ .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c | 1050 +++++++------- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c | 1252 ++++++++--------- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c | 1048 +++++++------- .../sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c | 40 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c | 16 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c | 600 ++++---- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 12 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c | 460 +++--- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c | 1044 +++++++------- 
.../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c | 1036 +++++++------- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 1018 +++++++------- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c | 1018 +++++++------- .../s6x16/bli_gemmsup_r_haswell_ref_sMx1.c | 44 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c | 608 ++++---- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c | 462 +++--- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c | 462 +++--- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c | 608 ++++---- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c | 462 +++--- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c | 462 +++--- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c | 1008 ++++++------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c | 1010 ++++++------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c | 1082 +++++++------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c | 1082 +++++++------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c | 1202 ++++++++-------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c | 1008 ++++++------- kernels/knc/3/bli_dgemm_knc_asm_30x8.c | 4 +- kernels/knc/3/bli_sgemm_knc_asm_30x16.c | 4 +- kernels/knl/1m/bli_dpackm_knl_asm_24x8.c | 4 +- kernels/knl/1m/bli_spackm_knl_asm_24x16.c | 4 +- kernels/knl/3/bli_dgemm_knl_asm_24x8.c | 4 +- kernels/knl/3/bli_sgemm_knl_asm_24x16.c | 4 +- kernels/penryn/1/bli_axpyv_penryn_int.c | 2 +- kernels/penryn/1/bli_dotv_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpy2v_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpyf_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotaxpyv_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotxf_penryn_int.c | 2 +- kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c | 8 +- .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c | 214 +-- .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c | 206 +-- kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c | 76 +- kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c | 76 +- .../3/bli_gemm_piledriver_asm_d8x3.c | 16 +- kernels/power10/3/bli_dgemm_power10_mma.c | 4 +- 
kernels/power10/3/bli_i16gemm_power10_mma.c | 4 +- kernels/power10/3/bli_i16sgemm_power10_mma.c | 4 +- kernels/power10/3/bli_i4gemm_power10_mma.c | 4 +- kernels/power10/3/bli_i8gemm_power10_mma.c | 4 +- kernels/power10/3/bli_sbgemm_power10_mma.c | 4 +- kernels/power10/3/bli_sgemm_power10_mma.c | 4 +- kernels/power10/3/bli_shgemm_power10_mma.c | 4 +- kernels/power7/3/bli_gemm_power7_int_8x4.c | 16 +- .../power7/3/test/bli_gemm_power7_int_8x4.h | 16 +- kernels/power9/3/bli_gemm_power9_asm_d12x6.c | 4 +- .../3/bli_gemm_sandybridge_asm_d8x4.c | 16 +- .../3/bli_gemm_sandybridge_int_d8x4.c | 16 +- kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 2 +- kernels/skx/3/bli_dgemm_skx_asm_16x14.c | 2 +- kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 2 +- kernels/zen/1/bli_amaxv_zen_int.c | 16 +- kernels/zen/1/bli_axpyv_zen_int.c | 4 +- kernels/zen/1/bli_axpyv_zen_int10.c | 4 +- kernels/zen/1/bli_copyv_zen_int.c | 660 ++++----- kernels/zen/1/bli_dotv_zen_int.c | 4 +- kernels/zen/1/bli_dotv_zen_int10.c | 4 +- kernels/zen/1/bli_dotxv_zen_int.c | 6 +- kernels/zen/1/bli_scalv_zen_int.c | 4 +- kernels/zen/1/bli_scalv_zen_int10.c | 8 +- kernels/zen/1/bli_setv_zen_int.c | 4 +- kernels/zen/1/bli_swapv_zen_int8.c | 4 +- kernels/zen/1f/bli_axpyf_zen_int_4.c | 2 +- kernels/zen/1f/bli_axpyf_zen_int_5.c | 6 +- kernels/zen/1f/bli_axpyf_zen_int_8.c | 4 +- kernels/zen/1f/bli_dotxf_zen_int_8.c | 4 +- .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c | 45 +- .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c | 34 +- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c | 56 +- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c | 36 +- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c | 30 +- kernels/zen2/1f/old/bli_axpyf_zen_int_5.c | 599 ++++++++ ref_kernels/1/bli_addv_ref.c | 2 +- ref_kernels/1/bli_amaxv_ref.c | 2 +- ref_kernels/1/bli_axpbyv_ref.c | 2 +- ref_kernels/1/bli_axpyv_ref.c | 4 +- ref_kernels/1/bli_copyv_ref.c | 2 +- ref_kernels/1/bli_dotv_ref.c | 2 +- ref_kernels/1/bli_dotxv_ref.c | 2 +- 
ref_kernels/1/bli_invertv_ref.c | 2 +- ref_kernels/1/bli_scal2v_ref.c | 2 +- ref_kernels/1/bli_scalv_ref.c | 2 +- ref_kernels/1/bli_setv_ref.c | 2 +- ref_kernels/1/bli_subv_ref.c | 2 +- ref_kernels/1/bli_swapv_ref.c | 2 +- ref_kernels/1/bli_xpbyv_ref.c | 2 +- ref_kernels/1f/bli_axpy2v_ref.c | 2 +- ref_kernels/1f/bli_axpyf_ref.c | 2 +- ref_kernels/1f/bli_dotaxpyv_ref.c | 2 +- ref_kernels/1f/bli_dotxaxpyf_ref.c | 2 +- ref_kernels/1f/bli_dotxf_ref.c | 2 +- ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c | 2 +- ref_kernels/1m/bli_packm_cxc_diag_ref.c | 2 +- ref_kernels/1m/bli_packm_cxk_1er_ref.c | 34 +- ref_kernels/1m/bli_packm_cxk_ref.c | 2 +- ref_kernels/1m/bli_unpackm_cxk_ref.c | 2 +- ref_kernels/3/bli_gemm_ref.c | 66 +- ref_kernels/3/bli_gemmsup_ref.c | 60 +- ref_kernels/3/bli_gemmtrsm_ref.c | 36 +- ref_kernels/3/bli_trsm_ref.c | 24 +- ref_kernels/bli_cntx_ref.c | 12 +- ref_kernels/ind/bli_gemm1m_ref.c | 4 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 4 +- ref_kernels/ind/bli_trsm1m_ref.c | 22 +- testsuite/src/test_axpy2v.c | 4 +- testsuite/src/test_axpyf.c | 4 +- testsuite/src/test_dotaxpyv.c | 6 +- testsuite/src/test_dotxaxpyf.c | 4 +- testsuite/src/test_dotxf.c | 4 +- testsuite/src/test_gemm_ukr.c | 2 +- testsuite/src/test_gemmtrsm_ukr.c | 2 +- testsuite/src/test_libblis.c | 10 +- testsuite/src/test_trsm_ukr.c | 2 +- 446 files changed, 19651 insertions(+), 19345 deletions(-) rename frame/2/gemv/{ => other}/bli_gemv_var_oapi.c.prev (100%) create mode 100644 kernels/zen2/1f/old/bli_axpyf_zen_int_5.c diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c index 5e29defe1..5f1ea0f42 100644 --- a/build/detect/config/config_detect.c +++ b/build/detect/config/config_detect.c @@ -69,8 +69,8 @@ int main( int argc, char** argv ) { - arch_t id = bli_cpuid_query_id(); - char* s = bli_arch_string( id ); + arch_t id = bli_cpuid_query_id(); + const char* s = bli_arch_string( id ); printf( "%s\n", s ); diff --git 
a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c index d1918466f..8796bab26 100644 --- a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c +++ b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c @@ -42,7 +42,7 @@ void bli_zaxpyv_template_noopt dcomplex* restrict alpha, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1/bli_dotv_template_noopt_var1.c b/config/template/kernels/1/bli_dotv_template_noopt_var1.c index 3761d2e76..90f93b817 100644 --- a/config/template/kernels/1/bli_dotv_template_noopt_var1.c +++ b/config/template/kernels/1/bli_dotv_template_noopt_var1.c @@ -43,7 +43,7 @@ void bli_zdotv_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -187,7 +187,7 @@ void bli_zdotv_template_noopt // Initialize accumulator to zero. 
bli_zset0s( dotxy ); - + conjx_use = conjx; // If y must be conjugated, we compute the result indirectly by first diff --git a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c index 7080abce0..5a12bf761 100644 --- a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c @@ -45,7 +45,7 @@ void bli_zaxpy2v_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c index a0afedfca..f7b492286 100644 --- a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c @@ -45,7 +45,7 @@ void bli_zaxpyf_template_noopt dcomplex* restrict a, inc_t inca, inc_t lda, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c index 275c39998..31a3097c0 100644 --- a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c @@ -46,7 +46,7 @@ void bli_zdotaxpyv_template_noopt dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c index 6754d86ce..aeb502f35 100644 --- a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c @@ -50,7 +50,7 @@ void bli_zdotxaxpyf_template_noopt dcomplex* 
restrict beta, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { diff --git a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c index 430fb277d..650303afe 100644 --- a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c @@ -46,7 +46,7 @@ void bli_zdotxf_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict beta, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -239,7 +239,7 @@ void bli_zdotxf_template_noopt if ( bli_is_conj( conjx ) ) bli_toggle_conj( &conjat_use ); - + // Iterate over columns of A and rows of x to compute: // Atx = conjat_use( A^T ) * x; if ( bli_is_noconj( conjat_use ) ) diff --git a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c index 06f25a0e9..190519fa0 100644 --- a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c @@ -45,8 +45,8 @@ void bli_zgemm_template_noopt dcomplex* restrict b1, dcomplex* restrict beta, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c index 87c21f7ed..d44fa4c1e 100644 --- a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c @@ -44,8 +44,8 @@ void bli_zgemmtrsm_l_template_noopt dcomplex* restrict b01, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c 
b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c index 0b4544ae1..0a3d59622 100644 --- a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c @@ -44,8 +44,8 @@ void bli_zgemmtrsm_u_template_noopt dcomplex* restrict b01, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c index ce15798b0..4e6634dea 100644 --- a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c @@ -40,8 +40,8 @@ void bli_ztrsm_l_template_noopt dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c index 661167c9c..42982459a 100644 --- a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c @@ -40,8 +40,8 @@ void bli_ztrsm_u_template_noopt dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c index 966f0c6aa..02867a22d 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_l0_xxsc_check( chi, psi ); \ @@ -63,7 +63,7 @@ GENFRONT( subsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi \ + const obj_t* chi \ ) \ { \ bli_l0_xsc_check( 
chi ); \ @@ -77,8 +77,8 @@ GENFRONT( invertsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* norm \ + const obj_t* chi, \ + const obj_t* norm \ ) \ { \ bli_l0_xx2sc_check( chi, norm ); \ @@ -91,9 +91,9 @@ GENFRONT( normfsc ) void bli_getsc_check ( - obj_t* chi, - double* zeta_r, - double* zeta_i + const obj_t* chi, + const double* zeta_r, + const double* zeta_i ) { err_t e_val; @@ -117,9 +117,9 @@ void bli_getsc_check void bli_setsc_check ( - double zeta_r, - double zeta_i, - obj_t* chi + double zeta_r, + double zeta_i, + const obj_t* chi ) { err_t e_val; @@ -143,9 +143,9 @@ void bli_setsc_check void bli_unzipsc_check ( - obj_t* chi, - obj_t* zeta_r, - obj_t* zeta_i + const obj_t* chi, + const obj_t* zeta_r, + const obj_t* zeta_i ) { err_t e_val; @@ -199,9 +199,9 @@ void bli_unzipsc_check void bli_zipsc_check ( - obj_t* zeta_r, - obj_t* zeta_i, - obj_t* chi + const obj_t* zeta_r, + const obj_t* zeta_i, + const obj_t* chi ) { err_t e_val; @@ -254,7 +254,7 @@ void bli_zipsc_check void bli_l0_xsc_check ( - obj_t* chi + const obj_t* chi ) { err_t e_val; @@ -280,8 +280,8 @@ void bli_l0_xsc_check void bli_l0_xxsc_check ( - obj_t* chi, - obj_t* psi + const obj_t* chi, + const obj_t* psi ) { err_t e_val; @@ -316,8 +316,8 @@ void bli_l0_xxsc_check void bli_l0_xx2sc_check ( - obj_t* chi, - obj_t* absq + const obj_t* chi, + const obj_t* absq ) { err_t e_val; @@ -355,9 +355,9 @@ void bli_l0_xx2sc_check void bli_l0_xxbsc_check ( - obj_t* chi, - obj_t* psi, - bool* is_eq + const obj_t* chi, + const obj_t* psi, + const bool* is_eq ) { err_t e_val; diff --git a/frame/0/bli_l0_check.h b/frame/0/bli_l0_check.h index f495866c6..1bbb4a756 100644 --- a/frame/0/bli_l0_check.h +++ b/frame/0/bli_l0_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ); GENTPROT( addsc ) @@ -59,7 +59,7 @@ GENTPROT( subsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi \ + const obj_t* chi \ ); 
GENTPROT( invertsc ) @@ -70,8 +70,8 @@ GENTPROT( invertsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* absq \ + const obj_t* chi, \ + const obj_t* absq \ ); GENTPROT( absqsc ) @@ -83,9 +83,9 @@ GENTPROT( normfsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ + const obj_t* chi, \ + const double* zeta_r, \ + const double* zeta_i \ ); GENTPROT( getsc ) @@ -96,9 +96,9 @@ GENTPROT( getsc ) \ void PASTEMAC(opname,_check) \ ( \ - double zeta_r, \ - double zeta_i, \ - obj_t* chi \ + double zeta_r, \ + double zeta_i, \ + const obj_t* chi \ ); GENTPROT( setsc ) @@ -109,9 +109,9 @@ GENTPROT( setsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ + const obj_t* chi, \ + const obj_t* zeta_r, \ + const obj_t* zeta_i \ ); GENTPROT( unzipsc ) @@ -122,9 +122,9 @@ GENTPROT( unzipsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ + const obj_t* zeta_r, \ + const obj_t* zeta_i, \ + const obj_t* chi \ ); GENTPROT( zipsc ) @@ -133,24 +133,24 @@ GENTPROT( zipsc ) void bli_l0_xsc_check ( - obj_t* chi + const obj_t* chi ); void bli_l0_xxsc_check ( - obj_t* chi, - obj_t* psi + const obj_t* chi, + const obj_t* psi ); void bli_l0_xx2sc_check ( - obj_t* chi, - obj_t* norm + const obj_t* chi, + const obj_t* norm ); void bli_l0_xxbsc_check ( - obj_t* chi, - obj_t* psi, - bool* is_eq + const obj_t* chi, + const obj_t* psi, + const bool* is_eq ); diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h index b90e35eb5..01d90cc3b 100644 --- a/frame/0/bli_l0_ft.h +++ b/frame/0/bli_l0_ft.h @@ -44,9 +44,9 @@ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTDEF( addsc ) @@ -73,9 +73,9 @@ INSERT_GENTDEF( invertsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + 
const ctype* chi, \ + ctype* psi \ ); INSERT_GENTDEF( mulsc ) @@ -87,8 +87,8 @@ INSERT_GENTDEF( mulsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) @@ -100,8 +100,8 @@ INSERT_GENTDEFR( absqsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype_r* norm \ + const ctype* chi, \ + ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) @@ -113,8 +113,8 @@ INSERT_GENTDEFR( normfsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype* psi \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) @@ -126,9 +126,9 @@ INSERT_GENTDEF( sqrtsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ); INSERT_GENTDEF( getsc ) @@ -154,9 +154,9 @@ INSERT_GENTDEF( setsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) @@ -168,9 +168,9 @@ INSERT_GENTDEFR( unzipsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ + const ctype_r* zeta_r, \ + const ctype_r* zeta_i, \ + ctype* chi \ ); INSERT_GENTDEFR( zipsc ) diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index ac62530db..0bfdbe3b3 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -43,25 +43,25 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* absq \ + const obj_t* chi, \ + const obj_t* absq \ ) \ { \ bli_init_once(); \ \ - num_t dt_chi; \ - num_t dt_absq_c = bli_obj_dt_proj_to_complex( absq ); \ + num_t dt_chi; \ + num_t dt_absq_c = bli_obj_dt_proj_to_complex( absq ); \ \ - void* buf_chi; \ - void* buf_absq = bli_obj_buffer_at_off( absq ); \ + const void* buf_chi; \ + void* buf_absq = bli_obj_buffer_at_off( absq ); \ \ if ( 
bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, absq ); \ + PASTEMAC(opname,_check)( chi, absq ); \ \ /* If chi is a scalar constant, use dt_absq_c to extract the address of the corresponding constant value; otherwise, use the datatype encoded within the chi object and extract the buffer at the chi offset. */ \ - bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \ + bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, ( void** )&buf_chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -83,8 +83,8 @@ GENFRONT( normfsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_init_once(); \ @@ -97,7 +97,7 @@ void PASTEMAC0(opname) \ void* buf_psi = bli_obj_buffer_at_off( psi ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, psi ); \ + PASTEMAC(opname,_check)( chi, psi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -122,7 +122,7 @@ GENFRONT( subsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi \ + const obj_t* chi \ ) \ { \ bli_init_once(); \ @@ -134,7 +134,7 @@ void PASTEMAC0(opname) \ void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi ); \ + PASTEMAC(opname,_check)( chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -155,8 +155,8 @@ GENFRONT( invertsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_init_once(); \ @@ -167,7 +167,7 @@ void PASTEMAC0(opname) \ void* buf_psi = bli_obj_buffer_at_off( psi ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, psi ); \ + PASTEMAC(opname,_check)( chi, psi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -188,9 +188,9 @@ GENFRONT( sqrtsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ + const obj_t* chi, \ + double* zeta_r, \ + double* zeta_i \ ) \ { \ bli_init_once(); \ @@ -205,7 +205,7 @@ void PASTEMAC0(opname) \ void* buf_chi = bli_obj_buffer_for_1x1( dt_def, chi ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ + PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ \ /* The _check() routine prevents integer types, so we know that chi is either a constant or an actual floating-point type. */ \ @@ -232,9 +232,9 @@ GENFRONT( getsc ) \ void PASTEMAC0(opname) \ ( \ - double zeta_r, \ - double zeta_i, \ - obj_t* chi \ + double zeta_r, \ + double zeta_i, \ + const obj_t* chi \ ) \ { \ bli_init_once(); \ @@ -244,7 +244,7 @@ void PASTEMAC0(opname) \ void* buf_chi = bli_obj_buffer_at_off( chi ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \ + PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -266,9 +266,9 @@ GENFRONT( setsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ + const obj_t* chi, \ + const obj_t* zeta_r, \ + const obj_t* zeta_i \ ) \ { \ bli_init_once(); \ @@ -282,7 +282,7 @@ void PASTEMAC0(opname) \ void* buf_zeta_i = bli_obj_buffer_at_off( zeta_i ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ + PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ \ /* If chi is a scalar constant, use dt_zeta_c to extract the address of the corresponding constant value; otherwise, use the datatype encoded @@ -309,9 +309,9 @@ GENFRONT( unzipsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ + const obj_t* zeta_r, \ + const obj_t* zeta_i, \ + const obj_t* chi \ ) \ { \ bli_init_once(); \ @@ -324,7 +324,7 @@ void PASTEMAC0(opname) \ void* buf_chi = bli_obj_buffer_at_off( chi ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ + PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h index 702bb40ea..a34252cf7 100644 --- a/frame/0/bli_l0_oapi.h +++ b/frame/0/bli_l0_oapi.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* absq \ + const obj_t* chi, \ + const obj_t* absq \ ); GENPROT( absqsc ) @@ -55,8 +55,8 @@ GENPROT( normfsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ); GENPROT( addsc ) @@ -71,7 +71,7 @@ GENPROT( subsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi \ + const obj_t* chi \ ); GENPROT( invertsc ) @@ -82,9 +82,9 @@ GENPROT( invertsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ + const obj_t* chi, \ + double* zeta_r, \ + double* zeta_i \ ); GENPROT( getsc ) @@ -95,9 +95,9 @@ GENPROT( getsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - double zeta_r, \ - double zeta_i, \ - obj_t* chi \ + double zeta_r, \ + double zeta_i, \ + const obj_t* chi \ ); GENPROT( setsc ) @@ -108,9 +108,9 @@ GENPROT( setsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ + const obj_t* chi, \ + const obj_t* zeta_r, \ + const obj_t* zeta_i \ ); GENPROT( unzipsc ) @@ -121,9 +121,9 @@ GENPROT( unzipsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ + const obj_t* zeta_r, \ + const obj_t* zeta_i, \ + const obj_t* chi \ ); GENPROT( zipsc ) diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index 620cad299..e0cdffcf3 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -43,9 +43,9 @@ \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ) \ { \ bli_init_once(); \ @@ -66,8 +66,8 @@ INSERT_GENTFUNC_BASIC( subsc, subs ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi \ + 
conj_t conjchi, \ + ctype* chi \ ) \ { \ bli_init_once(); \ @@ -87,9 +87,9 @@ INSERT_GENTFUNC_BASIC( invertsc, inverts ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ) \ { \ bli_init_once(); \ @@ -116,8 +116,8 @@ INSERT_GENTFUNC_BASIC( mulsc, scals ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ) \ { \ bli_init_once(); \ @@ -145,8 +145,8 @@ INSERT_GENTFUNCR_BASIC0( absqsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* norm \ + const ctype* chi, \ + ctype_r* norm \ ) \ { \ bli_init_once(); \ @@ -163,8 +163,8 @@ INSERT_GENTFUNCR_BASIC0( normfsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype* psi \ + const ctype* chi, \ + ctype* psi \ ) \ { \ bli_init_once(); \ @@ -181,9 +181,9 @@ INSERT_GENTFUNC_BASIC0( sqrtsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ) \ { \ bli_init_once(); \ @@ -199,9 +199,9 @@ INSERT_GENTFUNC_BASIC0( getsc ) \ void PASTEMAC(ch,opname) \ ( \ - double zeta_r, \ - double zeta_i, \ - ctype* chi \ + double zeta_r, \ + double zeta_i, \ + ctype* chi \ ) \ { \ bli_init_once(); \ @@ -217,9 +217,9 @@ INSERT_GENTFUNC_BASIC0( setsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ) \ { \ bli_init_once(); \ @@ -235,9 +235,9 @@ INSERT_GENTFUNCR_BASIC0( unzipsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ + const ctype_r* zeta_r, \ + const ctype_r* zeta_i, \ + ctype* chi \ ) \ { \ bli_init_once(); \ @@ -251,9 +251,9 @@ INSERT_GENTFUNCR_BASIC0( zipsc ) void bli_igetsc ( - dim_t* chi, - double* zeta_r, - double* zeta_i + const dim_t* chi, + double* zeta_r, + double* zeta_i ) { bli_init_once(); @@ -263,9 +263,9 @@ void 
bli_igetsc void bli_isetsc ( - double zeta_r, - double zeta_i, - dim_t* chi + double zeta_r, + double zeta_i, + dim_t* chi ) { bli_init_once(); diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index c2d600d66..b39303410 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -42,9 +42,9 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) @@ -58,8 +58,8 @@ INSERT_GENTPROT_BASIC0( subsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi \ + conj_t conjchi, \ + ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) @@ -70,8 +70,8 @@ INSERT_GENTPROT_BASIC0( invertsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) @@ -83,8 +83,8 @@ INSERT_GENTPROTR_BASIC0( normfsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype* psi \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) @@ -95,9 +95,9 @@ INSERT_GENTPROT_BASIC0( sqrtsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) @@ -108,9 +108,9 @@ INSERT_GENTPROT_BASIC0( getsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - double zeta_r, \ - double zeta_i, \ - ctype* chi \ + double zeta_r, \ + double zeta_i, \ + ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) @@ -121,9 +121,9 @@ INSERT_GENTPROT_BASIC0( setsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) @@ -134,9 +134,9 @@ INSERT_GENTPROTR_BASIC0( unzipsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - 
ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ + const ctype_r* zeta_r, \ + const ctype_r* zeta_i, \ + ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) @@ -145,15 +145,15 @@ INSERT_GENTPROTR_BASIC0( zipsc ) BLIS_EXPORT_BLIS void bli_igetsc ( - dim_t* chi, - double* zeta_r, - double* zeta_i + const dim_t* chi, + double* zeta_r, + double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( - double zeta_r, - double zeta_i, - dim_t* chi + double zeta_r, + double zeta_i, + dim_t* chi ); diff --git a/frame/0/copysc/bli_copysc.c b/frame/0/copysc/bli_copysc.c index 3001aa6c7..c2e01d07b 100644 --- a/frame/0/copysc/bli_copysc.c +++ b/frame/0/copysc/bli_copysc.c @@ -41,9 +41,9 @@ typedef void (*FUNCPTR_T) ( - conj_t conjchi, - void* chi, - void* psi + conj_t conjchi, + const void* chi, + void* psi ); static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc); @@ -57,24 +57,24 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc); \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_init_once(); \ \ - conj_t conjchi = bli_obj_conj_status( chi ); \ + conj_t conjchi = bli_obj_conj_status( chi ); \ \ - num_t dt_psi = bli_obj_dt( psi ); \ - void* buf_psi = bli_obj_buffer_at_off( psi ); \ + num_t dt_psi = bli_obj_dt( psi ); \ + void* buf_psi = bli_obj_buffer_at_off( psi ); \ \ - num_t dt_chi; \ - void* buf_chi; \ + num_t dt_chi; \ + void* buf_chi; \ \ FUNCPTR_T f; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, psi ); \ + PASTEMAC(opname,_check)( chi, psi ); \ \ /* If chi is a scalar constant, use dt_psi to extract the address of the corresponding constant value; otherwise, use the datatype encoded @@ -105,15 +105,15 @@ GENFRONT( copysc ) \ void PASTEMAC2(chx,chy,varname) \ ( \ - conj_t conjchi, \ - void* chi, \ - void* psi \ + conj_t conjchi, \ + const void* chi, \ + void* psi \ ) \ { \ bli_init_once(); \ \ - ctype_x* chi_cast = chi; \ - ctype_y* psi_cast = psi; \ + const ctype_x* chi_cast = chi; \ + 
ctype_y* psi_cast = psi; \ \ if ( bli_is_conj( conjchi ) ) \ { \ diff --git a/frame/0/copysc/bli_copysc.h b/frame/0/copysc/bli_copysc.h index 1dfd9d7bc..cd5481e57 100644 --- a/frame/0/copysc/bli_copysc.h +++ b/frame/0/copysc/bli_copysc.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ); GENFRONT( copysc ) @@ -57,9 +57,9 @@ GENFRONT( copysc ) \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ - conj_t conjchi, \ - void* chi, \ - void* psi \ + conj_t conjchi, \ + const void* chi, \ + void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c index 74b60febd..8ab470bf4 100644 --- a/frame/1/bli_l1v_check.c +++ b/frame/1/bli_l1v_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1v_xy_check( x, y ); \ @@ -61,8 +61,8 @@ GENFRONT( swapv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ ) \ { \ bli_l1v_xi_check( x, index ); \ @@ -76,10 +76,10 @@ GENFRONT( amaxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1v_axby_check( alpha, x, beta, y ); \ @@ -93,9 +93,9 @@ GENFRONT( axpbyv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1v_axy_check( alpha, x, y ); \ @@ -110,9 +110,9 @@ GENFRONT( scal2v ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ ) \ { \ bli_l1v_dot_check( &BLIS_ONE, x, y, &BLIS_ONE, rho ); \ @@ -126,11 +126,11 @@ GENFRONT( dotv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, 
\ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ ) \ { \ bli_l1v_dot_check( alpha, x, y, beta, rho ); \ @@ -144,7 +144,7 @@ GENFRONT( dotxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_l1v_x_check( x ); \ @@ -158,8 +158,8 @@ GENFRONT( invertv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ) \ { \ bli_l1v_ax_check( alpha, x ); \ @@ -174,9 +174,9 @@ GENFRONT( setv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1v_xby_check( x, beta, y ); \ @@ -189,8 +189,8 @@ GENFRONT( xpbyv ) void bli_l1v_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -230,9 +230,9 @@ void bli_l1v_xy_check void bli_l1v_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -281,9 +281,9 @@ void bli_l1v_axy_check void bli_l1v_xby_check ( - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -332,10 +332,10 @@ void bli_l1v_xby_check void bli_l1v_axby_check ( - obj_t* alpha, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -393,11 +393,11 @@ void bli_l1v_axby_check void bli_l1v_dot_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* beta, - obj_t* rho + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* beta, + const obj_t* rho ) { err_t e_val; @@ -467,7 +467,7 @@ void bli_l1v_dot_check void bli_l1v_x_check ( - obj_t* x + const obj_t* x ) { err_t e_val; @@ -490,8 +490,8 @@ void bli_l1v_x_check void bli_l1v_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { err_t e_val; @@ -523,8 +523,8 @@ 
void bli_l1v_ax_check void bli_l1v_xi_check ( - obj_t* x, - obj_t* index + const obj_t* x, + const obj_t* index ) { err_t e_val; diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h index 98051d0cd..110b25d55 100644 --- a/frame/1/bli_l1v_check.h +++ b/frame/1/bli_l1v_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( addv ) @@ -57,8 +57,8 @@ GENTPROT( swapv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ ); GENTPROT( amaxv ) @@ -69,10 +69,10 @@ GENTPROT( amaxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENTPROT( axpbyv ) @@ -83,9 +83,9 @@ GENTPROT( axpbyv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( axpyv ) @@ -97,9 +97,9 @@ GENTPROT( scal2v ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ ); GENTPROT( dotv ) @@ -110,11 +110,11 @@ GENTPROT( dotv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ ); GENTPROT( dotxv ) @@ -125,7 +125,7 @@ GENTPROT( dotxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENTPROT( invertv ) @@ -136,8 +136,8 @@ GENTPROT( invertv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ); GENTPROT( scalv ) @@ -149,9 +149,9 @@ GENTPROT( setv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const 
obj_t* y \ ); GENTPROT( xpbyv ) @@ -162,55 +162,55 @@ GENTPROT( xpbyv ) void bli_l1v_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_l1v_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ); void bli_l1v_xby_check ( - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* x, + const obj_t* beta, + const obj_t* y ); void bli_l1v_axby_check ( - obj_t* alpha, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* beta, + const obj_t* y ); void bli_l1v_dot_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* beta, - obj_t* rho + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* beta, + const obj_t* rho ); void bli_l1v_x_check ( - obj_t* x + const obj_t* x ); void bli_l1v_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); void bli_l1v_xi_check ( - obj_t* x, - obj_t* index + const obj_t* x, + const obj_t* index ); diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h index 162f1bf60..57f9d223a 100644 --- a/frame/1/bli_l1v_ft.h +++ b/frame/1/bli_l1v_ft.h @@ -44,10 +44,10 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -62,9 +62,9 @@ INSERT_GENTDEF( subv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ + dim_t n, \ + const ctype* x, inc_t incx, \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); @@ -77,12 +77,12 @@ INSERT_GENTDEF( amaxv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t 
incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -95,11 +95,11 @@ INSERT_GENTDEF( axpbyv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -113,12 +113,12 @@ INSERT_GENTDEF( scal2v ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); @@ -131,14 +131,14 @@ INSERT_GENTDEF( dotv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + const ctype* beta, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); @@ -151,8 +151,8 @@ INSERT_GENTDEF( dotxv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx \ + dim_t n, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); @@ -165,10 +165,10 @@ INSERT_GENTDEF( invertv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ + conj_t conjalpha, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); @@ -182,9 +182,9 @@ INSERT_GENTDEF( setv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -197,11 +197,11 @@ INSERT_GENTDEF( swapv ) \ typedef void 
(*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1/bli_l1v_ker_prot.h b/frame/1/bli_l1v_ker_prot.h index 1a1eec3f3..b912ba7e0 100644 --- a/frame/1/bli_l1v_ker_prot.h +++ b/frame/1/bli_l1v_ker_prot.h @@ -45,7 +45,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -56,7 +56,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -70,7 +70,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -83,7 +83,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -95,7 +95,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -109,7 +109,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -125,7 +125,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -135,7 +135,7 @@ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -147,7 +147,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -160,7 +160,7 @@ 
void PASTEMAC(ch,opname) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -172,7 +172,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -184,7 +184,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -195,7 +195,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -208,6 +208,6 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c index 201af2e09..4ea241693 100644 --- a/frame/1/bli_l1v_oapi.c +++ b/frame/1/bli_l1v_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -64,7 +64,7 @@ void PASTEMAC(opname,EX_SUF) \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, y ); \ + PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -92,8 +92,8 @@ GENFRONT( subv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -110,7 +110,7 @@ void PASTEMAC(opname,EX_SUF) \ void* buf_index = bli_obj_buffer_at_off( index ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, index ); \ + PASTEMAC(opname,_check)( x, index ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -119,11 +119,11 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - n, \ - buf_x, incx, \ - buf_index, \ - cntx, \ - rntm \ + n, \ + buf_x, incx, \ + buf_index, \ + cntx, \ + rntm \ ); \ } @@ -135,10 +135,10 @@ GENFRONT( amaxv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -162,7 +162,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, beta, y ); \ + PASTEMAC(opname,_check)( alpha, x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ @@ -180,14 +180,14 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - conjx, \ - n, \ - buf_alpha, \ - buf_x, inc_x, \ - buf_beta, \ - buf_y, inc_y, \ - cntx, \ - rntm \ + conjx, \ + n, \ + buf_alpha, \ + buf_x, inc_x, \ + buf_beta, \ + buf_y, inc_y, \ + cntx, \ + rntm \ ); \ } @@ -199,9 +199,9 @@ GENFRONT( axpbyv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -223,7 +223,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, y ); \ + PASTEMAC(opname,_check)( alpha, x, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -238,13 +238,13 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - conjx, \ - n, \ - buf_alpha, \ - buf_x, inc_x, \ - buf_y, inc_y, \ - cntx, \ - rntm \ + conjx, \ + n, \ + buf_alpha, \ + buf_x, inc_x, \ + buf_y, inc_y, \ + cntx, \ + rntm \ ); \ } @@ -257,9 +257,9 @@ GENFRONT( scal2v ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -279,7 +279,7 @@ void PASTEMAC(opname,EX_SUF) \ void* buf_rho = bli_obj_buffer_at_off( rho ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, y, rho ); \ + PASTEMAC(opname,_check)( x, y, rho ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -288,14 +288,14 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - buf_x, inc_x, \ - buf_y, inc_y, \ - buf_rho, \ - cntx, \ - rntm \ + conjx, \ + conjy, \ + n, \ + buf_x, inc_x, \ + buf_y, inc_y, \ + buf_rho, \ + cntx, \ + rntm \ ); \ } @@ -307,11 +307,11 @@ GENFRONT( dotv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -337,7 +337,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, y, beta, rho ); \ + PASTEMAC(opname,_check)( alpha, x, y, beta, rho ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -355,16 +355,16 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - buf_alpha, \ - buf_x, inc_x, \ - buf_y, inc_y, \ - buf_beta, \ - buf_rho, \ - cntx, \ - rntm \ + conjx, \ + conjy, \ + n, \ + buf_alpha, \ + buf_x, inc_x, \ + buf_y, inc_y, \ + buf_beta, \ + buf_rho, \ + cntx, \ + rntm \ ); \ } @@ -376,7 +376,7 @@ GENFRONT( dotxv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -391,7 +391,7 @@ void PASTEMAC(opname,EX_SUF) \ inc_t inc_x = bli_obj_vector_inc( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x ); \ + PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -400,10 +400,10 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - n, \ - buf_x, inc_x, \ - cntx, \ - rntm \ + n, \ + buf_x, inc_x, \ + cntx, \ + rntm \ ); \ } @@ -415,8 +415,8 @@ GENFRONT( invertv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -436,7 +436,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x ); \ + PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -451,12 +451,12 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ - n, \ - buf_alpha, \ - buf_x, inc_x, \ - cntx, \ - rntm \ + BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ + n, \ + buf_alpha, \ + buf_x, inc_x, \ + cntx, \ + rntm \ ); \ } @@ -469,8 +469,8 @@ GENFRONT( setv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -487,7 +487,7 @@ void PASTEMAC(opname,EX_SUF) \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, y ); \ + PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -496,11 +496,11 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - n, \ - buf_x, inc_x, \ - buf_y, inc_y, \ - cntx, \ - rntm \ + n, \ + buf_x, inc_x, \ + buf_y, inc_y, \ + cntx, \ + rntm \ ); \ } @@ -512,9 +512,9 @@ GENFRONT( swapv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -536,7 +536,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, beta, y ); \ + PASTEMAC(opname,_check)( x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -551,13 +551,13 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - conjx, \ - n, \ - buf_x, inc_x, \ - buf_beta, \ - buf_y, inc_y, \ - cntx, \ - rntm \ + conjx, \ + n, \ + buf_x, inc_x, \ + buf_beta, \ + buf_y, inc_y, \ + cntx, \ + rntm \ ); \ } diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h index 41aecdc4d..957747a2a 100644 --- a/frame/1/bli_l1v_oapi.h +++ b/frame/1/bli_l1v_oapi.h @@ -42,10 +42,10 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( addv ) GENTPROT( copyv ) @@ -57,8 +57,8 @@ GENTPROT( subv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); @@ -70,10 +70,10 @@ GENTPROT( amaxv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -85,11 +85,11 @@ GENTPROT( axpbyv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( 
axpyv ) GENTPROT( scal2v ) @@ -100,11 +100,11 @@ GENTPROT( scal2v ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( dotv ) @@ -114,13 +114,13 @@ GENTPROT( dotv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( dotxv ) @@ -130,9 +130,9 @@ GENTPROT( dotxv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( invertv ) @@ -142,10 +142,10 @@ GENTPROT( invertv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( scalv ) GENTPROT( setv ) @@ -156,10 +156,10 @@ GENTPROT( setv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( swapv ) @@ -169,9 +169,9 @@ GENTPROT( swapv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 1d12b42eb..01e3356d5 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -45,10 +45,10 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -67,9 +67,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - x, incx, \ - y, incy, \ - cntx \ + ( ctype* )x, incx, \ + y, 
incy, \ + ( cntx_t* )cntx \ ); \ } @@ -83,9 +83,9 @@ INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ + dim_t n, \ + const ctype* x, inc_t incx, \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -103,9 +103,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ f \ ( \ n, \ - x, incx, \ + ( ctype* )x, incx, \ index, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -117,12 +117,12 @@ INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -141,11 +141,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - alpha, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -157,11 +157,11 @@ INSERT_GENTFUNC_BASIC( axpbyv, BLIS_AXPBYV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -181,10 +181,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -197,12 +197,12 @@ INSERT_GENTFUNC_BASIC( scal2v, BLIS_SCAL2V_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ 
) \ { \ @@ -222,10 +222,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - x, incx, \ - y, incy, \ - rho, \ - cntx \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + rho, \ + ( cntx_t* )cntx \ ); \ } @@ -237,14 +237,14 @@ INSERT_GENTFUNC_BASIC( dotv, BLIS_DOTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + const ctype* beta, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -264,12 +264,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - beta, \ - rho, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + ( ctype* )beta, \ + rho, \ + ( cntx_t* )cntx \ ); \ } @@ -281,8 +281,8 @@ INSERT_GENTFUNC_BASIC( dotxv, BLIS_DOTXV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx \ + dim_t n, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -301,7 +301,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ n, \ x, incx, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -313,10 +313,10 @@ INSERT_GENTFUNC_BASIC( invertv, BLIS_INVERTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ + conj_t conjalpha, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -335,9 +335,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjalpha, \ n, \ - alpha, \ - x, incx, \ - cntx \ + ( ctype* )alpha, \ + x, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -350,9 +350,9 @@ INSERT_GENTFUNC_BASIC( setv, BLIS_SETV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -372,7 +372,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ x, incx, \ y, incy, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -383,11 +383,11 @@ INSERT_GENTFUNC_BASIC( swapv, BLIS_SWAPV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -406,10 +406,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h index 5cb3295ef..c1965cb3c 100644 --- a/frame/1/bli_l1v_tapi.h +++ b/frame/1/bli_l1v_tapi.h @@ -42,10 +42,10 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -59,9 +59,9 @@ INSERT_GENTPROT_BASIC0( subv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ + dim_t n, \ + const ctype* x, inc_t incx, \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -73,12 +73,12 @@ INSERT_GENTPROT_BASIC0( amaxv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -90,11 +90,11 @@ INSERT_GENTPROT_BASIC0( axpbyv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + 
dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -107,12 +107,12 @@ INSERT_GENTPROT_BASIC0( scal2v ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -124,14 +124,14 @@ INSERT_GENTPROT_BASIC0( dotv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + const ctype* beta, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -143,8 +143,8 @@ INSERT_GENTPROT_BASIC0( dotxv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx \ + dim_t n, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -156,10 +156,10 @@ INSERT_GENTPROT_BASIC0( invertv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ + conj_t conjalpha, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -172,9 +172,9 @@ INSERT_GENTPROT_BASIC0( setv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -186,11 +186,11 @@ INSERT_GENTPROT_BASIC0( swapv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const 
ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c index 908a410ad..fcc62a757 100644 --- a/frame/1d/bli_l1d_check.c +++ b/frame/1d/bli_l1d_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1d_xy_check( x, y ); \ @@ -60,9 +60,9 @@ GENFRONT( subd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1d_axy_check( alpha, x, y ); \ @@ -77,7 +77,7 @@ GENFRONT( scal2d ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_l1d_x_check( x ); \ @@ -91,8 +91,8 @@ GENFRONT( invertd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ) \ { \ bli_l1d_ax_check( alpha, x ); \ @@ -109,9 +109,9 @@ GENFRONT( shiftd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1d_axy_check( beta, x, y ); \ @@ -124,8 +124,8 @@ GENFRONT( xpbyd ) void bli_l1d_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -165,9 +165,9 @@ void bli_l1d_xy_check void bli_l1d_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -216,7 +216,7 @@ void bli_l1d_axy_check void bli_l1d_x_check ( - obj_t* x + const obj_t* x ) { err_t e_val; @@ -239,8 +239,8 @@ void bli_l1d_x_check void bli_l1d_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { err_t e_val; diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h index 6d000d314..1ef57e236 100644 --- a/frame/1d/bli_l1d_check.h +++ b/frame/1d/bli_l1d_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ 
- obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( addd ) @@ -56,9 +56,9 @@ GENTPROT( subd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( axpyd ) @@ -70,7 +70,7 @@ GENTPROT( scal2d ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENTPROT( invertd ) @@ -81,8 +81,8 @@ GENTPROT( invertd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ); GENTPROT( scald ) @@ -96,9 +96,9 @@ GENTPROT( shiftd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENTPROT( xpbyd ) @@ -108,25 +108,25 @@ GENTPROT( xpbyd ) void bli_l1d_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_l1d_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ); void bli_l1d_x_check ( - obj_t* x + const obj_t* x ); void bli_l1d_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h index 53e296616..3de317527 100644 --- a/frame/1d/bli_l1d_ft.h +++ b/frame/1d/bli_l1d_ft.h @@ -44,13 +44,13 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -65,14 +65,14 @@ INSERT_GENTDEF( subd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, 
inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -86,10 +86,10 @@ INSERT_GENTDEF( scal2d ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -102,12 +102,12 @@ INSERT_GENTDEF( invertd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -121,11 +121,11 @@ INSERT_GENTDEF( setd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype_r* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -138,11 +138,11 @@ INSERT_GENTDEFR( setid ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -155,14 +155,14 @@ INSERT_GENTDEF( shiftd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t 
cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c index 15e68cf50..7027e7780 100644 --- a/frame/1d/bli_l1d_oapi.c +++ b/frame/1d/bli_l1d_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -69,7 +69,7 @@ void PASTEMAC(opname,EX_SUF) \ inc_t cs_y = bli_obj_col_stride( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, y ); \ + PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -100,9 +100,9 @@ GENFRONT( subd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -129,7 +129,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, y ); \ + PASTEMAC(opname,_check)( alpha, x, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -166,7 +166,7 @@ GENFRONT( scal2d ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -184,7 +184,7 @@ void PASTEMAC(opname,EX_SUF) \ inc_t cs_x = bli_obj_col_stride( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x ); \ + PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -210,8 +210,8 @@ GENFRONT( invertd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -234,7 +234,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x ); \ + PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -269,8 +269,8 @@ GENFRONT( setd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -290,7 +290,7 @@ void PASTEMAC(opname,EX_SUF) \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x ); \ + PASTEMAC(opname,_check)( alpha, x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -317,8 +317,8 @@ GENFRONT( setid ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -340,7 +340,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x ); \ + PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -373,9 +373,9 @@ GENFRONT( shiftd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -402,7 +402,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, beta, y ); \ + PASTEMAC(opname,_check)( x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h index 47129b771..66f9d698c 100644 --- a/frame/1d/bli_l1d_oapi.h +++ b/frame/1d/bli_l1d_oapi.h @@ -42,10 +42,10 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( addd ) GENTPROT( copyd ) @@ -57,11 +57,11 @@ GENTPROT( subd ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( axpyd ) GENTPROT( scal2d ) @@ -72,9 +72,9 @@ GENTPROT( scal2d ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( invertd ) @@ -84,10 +84,10 @@ GENTPROT( invertd ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( scald ) GENTPROT( setd ) @@ -100,9 +100,9 @@ GENTPROT( shiftd ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index cfaf5150f..60916cd56 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -45,13 +45,13 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -61,12 +61,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ - ctype* y1; \ - conj_t conjx; \ - dim_t 
n_elem; \ - dim_t offx, offy; \ - inc_t incx, incy; \ + const ctype* x1; \ + ctype* y1; \ + conj_t conjx; \ + dim_t n_elem; \ + dim_t offx, offy; \ + inc_t incx, incy; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ @@ -108,9 +108,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ + ( ctype* )x1, incx, \ + y1, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -124,14 +124,14 @@ INSERT_GENTFUNC_BASIC2( subd, subv, BLIS_SUBV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -141,12 +141,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ - ctype* y1; \ - conj_t conjx; \ - dim_t n_elem; \ - dim_t offx, offy; \ - inc_t incx, incy; \ + const ctype* x1; \ + ctype* y1; \ + conj_t conjx; \ + dim_t n_elem; \ + dim_t offx, offy; \ + inc_t incx, incy; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ @@ -188,10 +188,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x1, incx, \ + y1, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -204,10 +204,10 @@ INSERT_GENTFUNC_BASIC2( scal2d, scal2v, BLIS_SCAL2V_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -248,7 +248,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ n_elem, \ x1, incx, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -260,12 +260,12 @@ INSERT_GENTFUNC_BASIC2( invertd, invertv, 
BLIS_INVERTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -306,9 +306,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjalpha, \ n_elem, \ - alpha, \ - x1, incx, \ - cntx \ + ( ctype* )alpha, \ + x1, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -321,11 +321,11 @@ INSERT_GENTFUNC_BASIC2( setd, setv, BLIS_SETV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype_r* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -383,9 +383,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ - alpha, \ - x1, incx, \ - cntx \ + ( ctype_r* )alpha, \ + x1, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -397,11 +397,11 @@ INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -442,9 +442,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ - alpha, 0, \ - x1, incx, \ - cntx \ + ( ctype* )alpha, 0, \ + x1, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -456,14 +456,14 @@ INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + 
const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -473,12 +473,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ - ctype* y1; \ - conj_t conjx; \ - dim_t n_elem; \ - dim_t offx, offy; \ - inc_t incx, incy; \ + const ctype* x1; \ + ctype* y1; \ + conj_t conjx; \ + dim_t n_elem; \ + dim_t offx, offy; \ + inc_t incx, incy; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ @@ -520,10 +520,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ + ( ctype* )x1, incx, \ + ( ctype* )beta, \ + y1, incy, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h index 35d093e86..831b3d390 100644 --- a/frame/1d/bli_l1d_tapi.h +++ b/frame/1d/bli_l1d_tapi.h @@ -42,13 +42,13 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -62,14 +62,14 @@ INSERT_GENTPROT_BASIC0( subd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -82,10 +82,10 @@ INSERT_GENTPROT_BASIC0( scal2d ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + 
doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -97,12 +97,12 @@ INSERT_GENTPROT_BASIC0( invertd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -115,11 +115,11 @@ INSERT_GENTPROT_BASIC0( setd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype_r* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -131,11 +131,11 @@ INSERT_GENTPROTR_BASIC0( setid ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -147,14 +147,14 @@ INSERT_GENTPROT_BASIC0( shiftd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1f/bli_l1f_check.c b/frame/1f/bli_l1f_check.c index c880237c1..e05cb7750 100644 --- a/frame/1f/bli_l1f_check.c +++ b/frame/1f/bli_l1f_check.c @@ -40,11 +40,11 @@ void bli_axpy2v_check ( - obj_t* alphax, - obj_t* alphay, - obj_t* x, - obj_t* y, - obj_t* z + const obj_t* alphax, + 
const obj_t* alphay, + const obj_t* x, + const obj_t* y, + const obj_t* z ) { err_t e_val; @@ -118,10 +118,10 @@ void bli_axpy2v_check void bli_axpyf_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -186,12 +186,12 @@ void bli_axpyf_check void bli_dotaxpyv_check ( - obj_t* alpha, - obj_t* xt, - obj_t* x, - obj_t* y, - obj_t* rho, - obj_t* z + const obj_t* alpha, + const obj_t* xt, + const obj_t* x, + const obj_t* y, + const obj_t* rho, + const obj_t* z ) { err_t e_val; @@ -288,14 +288,14 @@ void bli_dotaxpyv_check void bli_dotxaxpyf_check ( - obj_t* alpha, - obj_t* at, - obj_t* a, - obj_t* w, - obj_t* x, - obj_t* beta, - obj_t* y, - obj_t* z + const obj_t* alpha, + const obj_t* at, + const obj_t* a, + const obj_t* w, + const obj_t* x, + const obj_t* beta, + const obj_t* y, + const obj_t* z ) { err_t e_val; @@ -425,11 +425,11 @@ void bli_dotxaxpyf_check void bli_dotxf_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; diff --git a/frame/1f/bli_l1f_check.h b/frame/1f/bli_l1f_check.h index d630f3205..9cd53107a 100644 --- a/frame/1f/bli_l1f_check.h +++ b/frame/1f/bli_l1f_check.h @@ -42,11 +42,11 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alphax, \ + const obj_t* alphay, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* z \ ); GENTPROT( axpy2v ) @@ -57,10 +57,10 @@ GENTPROT( axpy2v ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( axpyf ) @@ -71,12 +71,12 @@ GENTPROT( axpyf ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* 
z \ + const obj_t* alpha, \ + const obj_t* xt, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho, \ + const obj_t* z \ ); GENTPROT( dotaxpyv ) @@ -87,14 +87,14 @@ GENTPROT( dotaxpyv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* at, \ + const obj_t* a, \ + const obj_t* w, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y, \ + const obj_t* z \ ); GENTPROT( dotxaxpyf ) @@ -105,11 +105,11 @@ GENTPROT( dotxaxpyf ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENTPROT( dotxf ) diff --git a/frame/1f/bli_l1f_ft.h b/frame/1f/bli_l1f_ft.h index 1c7bfd9b6..8e143bf54 100644 --- a/frame/1f/bli_l1f_ft.h +++ b/frame/1f/bli_l1f_ft.h @@ -44,14 +44,14 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha1, \ - ctype* alpha2, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha1, \ + const ctype* alpha2, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -64,14 +64,14 @@ INSERT_GENTDEF( axpy2v ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -84,15 +84,15 @@ INSERT_GENTDEF( axpyf ) \ typedef void 
(*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -105,15 +105,15 @@ INSERT_GENTDEF( dotaxpyv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -126,19 +126,19 @@ INSERT_GENTDEF( dotxf ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* w, inc_t incw, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1f/bli_l1f_ker_prot.h b/frame/1f/bli_l1f_ker_prot.h index 18eea4568..4393faf10 100644 --- a/frame/1f/bli_l1f_ker_prot.h +++ b/frame/1f/bli_l1f_ker_prot.h @@ -49,7 +49,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - 
cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -65,7 +65,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -82,7 +82,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -103,7 +103,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -120,6 +120,6 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); diff --git a/frame/1f/bli_l1f_oapi.c b/frame/1f/bli_l1f_oapi.c index db8fdfb68..f1e65a252 100644 --- a/frame/1f/bli_l1f_oapi.c +++ b/frame/1f/bli_l1f_oapi.c @@ -45,11 +45,11 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alphax, \ + const obj_t* alphay, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -76,7 +76,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alphay_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alphax, alphay, x, y, z ); \ + PASTEMAC(opname,_check)( alphax, alphay, x, y, z ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ @@ -115,10 +115,10 @@ GENFRONT( axpy2v ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -145,7 +145,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, x, y ); \ + PASTEMAC(opname,_check)( alpha, a, x, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -184,12 +184,12 @@ GENFRONT( axpyf ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* xt, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -216,7 +216,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, xt, x, y, rho, z ); \ + PASTEMAC(opname,_check)( alpha, xt, x, y, rho, z ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -253,14 +253,14 @@ GENFRONT( dotaxpyv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* at, \ + const obj_t* a, \ + const obj_t* w, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -295,7 +295,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, at, a, w, x, beta, y, z ); \ + PASTEMAC(opname,_check)( alpha, at, a, w, x, beta, y, z ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ @@ -342,11 +342,11 @@ GENFRONT( dotxaxpyf ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -375,7 +375,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ + PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ diff --git a/frame/1f/bli_l1f_oapi.h b/frame/1f/bli_l1f_oapi.h index 0348c4871..d0d53a6df 100644 --- a/frame/1f/bli_l1f_oapi.h +++ b/frame/1f/bli_l1f_oapi.h @@ -42,11 +42,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alphax, \ + const obj_t* alphay, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); @@ -58,10 +58,10 @@ GENTPROT( axpy2v ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -73,12 +73,12 @@ GENTPROT( axpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* xt, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); @@ -90,14 +90,14 @@ GENTPROT( dotaxpyv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* at, \ + const obj_t* a, \ + const obj_t* w, \ + const obj_t* x, \ + const obj_t* beta, \ + const 
obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); @@ -109,11 +109,11 @@ GENTPROT( dotxaxpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index a54379299..04d100cb3 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -45,14 +45,14 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alphax, \ - ctype* alphay, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alphax, \ + const ctype* alphay, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -72,12 +72,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - alphax, \ - alphay, \ - x, incx, \ - y, incy, \ - z, incz, \ - cntx \ + ( ctype* )alphax, \ + ( ctype* )alphay, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + z, incz, \ + ( cntx_t* )cntx \ ); \ } @@ -89,14 +89,14 @@ INSERT_GENTFUNC_BASIC( axpy2v, BLIS_AXPY2V_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -117,11 +117,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, inca, lda, \ + ( ctype* )x, incx, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ 
-133,15 +133,15 @@ INSERT_GENTFUNC_BASIC( axpyf, BLIS_AXPYF_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -162,12 +162,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - rho, \ - z, incz, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + rho, \ + z, incz, \ + ( cntx_t* )cntx \ ); \ } @@ -179,19 +179,19 @@ INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_DOTAXPYV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* w, inc_t incw, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -214,14 +214,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ b_n, \ - alpha, \ - a, inca, lda, \ - w, incw, \ - x, incx, \ - beta, \ - y, incy, \ - z, incz, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, inca, lda, \ + ( ctype* )w, incw, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + z, incz, \ + ( cntx_t* )cntx \ ); \ } @@ -233,15 +233,15 @@ INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_DOTXAXPYF_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t 
conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -262,12 +262,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, inca, lda, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/1f/bli_l1f_tapi.h b/frame/1f/bli_l1f_tapi.h index 2138b989d..2ea54df4c 100644 --- a/frame/1f/bli_l1f_tapi.h +++ b/frame/1f/bli_l1f_tapi.h @@ -42,14 +42,14 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alphax, \ - ctype* alphay, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alphax, \ + const ctype* alphay, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -61,14 +61,14 @@ INSERT_GENTPROT_BASIC0( axpy2v ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -80,15 +80,15 @@ INSERT_GENTPROT_BASIC0( axpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, 
inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -100,19 +100,19 @@ INSERT_GENTPROT_BASIC0( dotaxpyv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* w, inc_t incw, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -124,15 +124,15 @@ INSERT_GENTPROT_BASIC0( dotxaxpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c index 8914e43b1..f5d4bf1b4 100644 --- a/frame/1m/bli_l1m_check.c +++ b/frame/1m/bli_l1m_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1m_xy_check( x, y ); \ @@ -60,9 +60,9 @@ GENFRONT( subm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y 
\ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1m_axy_check( alpha, x, y ); \ @@ -77,8 +77,8 @@ GENFRONT( scal2m ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ) \ { \ bli_l1m_ax_check( alpha, x ); \ @@ -93,9 +93,9 @@ GENFRONT( setm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1m_axy_check( beta, x, y ); \ @@ -108,8 +108,8 @@ GENFRONT( xpbym ) void bli_l1m_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -149,9 +149,9 @@ void bli_l1m_xy_check void bli_l1m_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -200,8 +200,8 @@ void bli_l1m_axy_check void bli_l1m_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { err_t e_val; diff --git a/frame/1m/bli_l1m_check.h b/frame/1m/bli_l1m_check.h index 030c0e219..6089dfa17 100644 --- a/frame/1m/bli_l1m_check.h +++ b/frame/1m/bli_l1m_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ); GENPROT( addm ) @@ -56,9 +56,9 @@ GENPROT( subm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ); GENPROT( axpym ) @@ -70,8 +70,8 @@ GENPROT( scal2m ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ); GENPROT( scalm ) @@ -83,9 +83,9 @@ GENPROT( setm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENPROT( xpbym ) @@ -95,20 +95,20 @@ GENPROT( xpbym ) void bli_l1m_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_l1m_axy_check ( - 
obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ); void bli_l1m_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index af6c384e5..36d06b2fe 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -44,14 +44,14 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -66,15 +66,15 @@ INSERT_GENTDEF( copym ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -87,15 +87,15 @@ INSERT_GENTDEF( axpym ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -108,14 +108,14 @@ INSERT_GENTDEF( scal2m ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - doff_t 
diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -129,15 +129,15 @@ INSERT_GENTDEF( setm ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 41d80e217..f25c3c943 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -90,7 +90,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); INSERT_GENTDEF( packm_cxk ) @@ -109,7 +109,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); INSERT_GENTDEF( unpackm_cxk ) @@ -132,7 +132,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); INSERT_GENTDEF( packm_cxc_diag ) diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h index 80284ea22..8430614d2 100644 --- a/frame/1m/bli_l1m_ker_prot.h +++ b/frame/1m/bli_l1m_ker_prot.h @@ -51,7 +51,7 @@ void PASTEMAC(ch,varname) \ ctype* restrict kappa, \ ctype* 
restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -68,7 +68,7 @@ void PASTEMAC(ch,varname) \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -89,6 +89,6 @@ void PASTEMAC(ch,varname) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c index 840b058d4..7520afce7 100644 --- a/frame/1m/bli_l1m_oapi.c +++ b/frame/1m/bli_l1m_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -70,7 +70,7 @@ void PASTEMAC(opname,EX_SUF) \ inc_t cs_y = bli_obj_col_stride( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, y ); \ + PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -102,9 +102,9 @@ GENFRONT( subm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -132,7 +132,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, y ); \ + PASTEMAC(opname,_check)( alpha, x, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ @@ -170,8 +170,8 @@ GENFRONT( scal2m ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -197,7 +197,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t x_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x ); \ + PASTEMAC(opname,_check)( alpha, x ); \ \ /* Alias x to x_local so we can apply alpha if it is non-unit. */ \ bli_obj_alias_to( x, &x_local ); \ @@ -245,8 +245,8 @@ GENFRONT( scalm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -271,7 +271,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x ); \ + PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -307,9 +307,9 @@ GENFRONT( setm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -340,7 +340,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, beta, y ); \ + PASTEMAC(opname,_check)( x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ @@ -377,9 +377,9 @@ GENFRONT( xpbym ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/1m/bli_l1m_oapi.h b/frame/1m/bli_l1m_oapi.h index a6a94cf9f..9510f1aee 100644 --- a/frame/1m/bli_l1m_oapi.h +++ b/frame/1m/bli_l1m_oapi.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -57,9 +57,9 @@ GENPROT( subm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -72,8 +72,8 @@ GENPROT( scal2m ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); @@ -86,9 +86,9 @@ GENPROT( setm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 0b60d4e2f..325ed0ecf 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -45,12 +45,12 @@ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ - obj_t* a, \ - obj_t* p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + obj_t* p, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + const thrinfo_t* thread \ ); GENTDEF( packm ) @@ -61,11 +61,11 @@ GENTDEF( packm ) \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ - obj_t* p, \ - obj_t* a, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* p, \ + const obj_t* a, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ + const thrinfo_t* thread \ ); GENTDEF( unpackm ) diff --git a/frame/1m/bli_l1m_tapi.c 
b/frame/1m/bli_l1m_tapi.c index 2b3c4bb4a..6b802b9fe 100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -45,14 +45,14 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -75,9 +75,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -110,14 +110,14 @@ INSERT_GENTFUNC_BASIC( subm, subd ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -140,9 +140,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -179,15 +179,15 @@ INSERT_GENTFUNC_BASIC0( copym ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ 
{ \ @@ -213,10 +213,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - alpha, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )alpha, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -249,15 +249,15 @@ INSERT_GENTFUNC_BASIC0( axpym ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -302,10 +302,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - alpha, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )alpha, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -341,14 +341,14 @@ INSERT_GENTFUNC_BASIC0( scal2m ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -371,9 +371,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ uplox, \ m, \ n, \ - alpha, \ - x, rs_x, cs_x, \ - cntx, \ + ( ctype* )alpha, \ + x, rs_x, cs_x, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -387,15 +387,15 @@ INSERT_GENTFUNC_BASIC0( setm ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ 
+ trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -419,9 +419,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -438,10 +438,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - beta, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + ( ctype* )beta, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -474,15 +474,15 @@ INSERT_GENTFUNC_BASIC0( xpbym ) \ void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype_x* x, inc_t rs_x, inc_t cs_x, \ - ctype_y* beta, \ - ctype_y* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype_x* x, inc_t rs_x, inc_t cs_x, \ + const ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -520,10 +520,10 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - beta, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype_x* )x, rs_x, cs_x, \ + ( ctype_y* )beta, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ } diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h index 03a1196ed..68646a71f 100644 --- a/frame/1m/bli_l1m_tapi.h +++ b/frame/1m/bli_l1m_tapi.h @@ -42,14 +42,14 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, 
inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -63,15 +63,15 @@ INSERT_GENTPROT_BASIC0( subm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -84,14 +84,14 @@ INSERT_GENTPROT_BASIC0( scal2m ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -104,15 +104,15 @@ INSERT_GENTPROT_BASIC0( setm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -124,15 +124,15 @@ INSERT_GENTPROT_BASIC0( xpbym ) \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype_x* x, inc_t rs_x, inc_t cs_x, \ - ctype_y* beta, \ - ctype_y* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + 
const ctype_x* x, inc_t rs_x, inc_t cs_x, \ + const ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index b12a93ddc..22ed31ecc 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -37,10 +37,10 @@ void* bli_packm_alloc ( - siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ) { // Query the pack buffer type from the control tree node. @@ -58,11 +58,11 @@ void* bli_packm_alloc void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ) { // Query the address of the mem_t entry within the control tree node. diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h index 5a5cf126b..aec2e1af5 100644 --- a/frame/1m/packm/bli_packm_alloc.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -34,18 +34,18 @@ BLIS_EXPORT_BLIS void* bli_packm_alloc ( - siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index e13391151..601f2c05c 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -54,12 +54,12 @@ static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); void bli_packm_blk_var1 ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* 
thread + const obj_t* c, + obj_t* p, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ) { // Extract various fields from the control tree. @@ -271,7 +271,7 @@ void bli_packm_blk_var1 c_use, incc, ldc, p_use, ldp, is_p_use, - cntx, + ( cntx_t* )cntx, params ); } @@ -303,7 +303,7 @@ void bli_packm_blk_var1 kappa_cast, c_begin, incc, ldc, p_begin, ldp, is_p, - cntx, + ( cntx_t* )cntx, params ); } } diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 9cda5828b..5797e3b94 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -39,8 +39,8 @@ typedef struct { - // Type of C Type of P - packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; + // Type of C Type of P + packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; } packm_blk_var1_params_t; // @@ -49,11 +49,11 @@ typedef struct BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* t + const obj_t* c, + obj_t* p, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* t ); diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c index e662a85df..15bd032ca 100644 --- a/frame/1m/packm/bli_packm_check.c +++ b/frame/1m/packm/bli_packm_check.c @@ -37,9 +37,9 @@ void bli_packm_init_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, + const cntx_t* cntx ) { err_t e_val; @@ -59,9 +59,9 @@ void bli_packm_init_check void bli_packm_int_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, + const cntx_t* cntx ) { err_t e_val; diff --git a/frame/1m/packm/bli_packm_check.h b/frame/1m/packm/bli_packm_check.h index be375fcf7..da9399b31 100644 --- a/frame/1m/packm/bli_packm_check.h +++ b/frame/1m/packm/bli_packm_check.h @@ -34,15 +34,15 @@ void bli_packm_init_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, 
+ const cntx_t* cntx ); void bli_packm_int_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, + const cntx_t* cntx ); diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 14bfe1ce8..be0fc8fde 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -35,48 +35,48 @@ struct packm_params_s { - uint64_t size; // size field must be present and come first. - bszid_t bmid_m; - bszid_t bmid_n; - bool does_invert_diag; - bool rev_iter_if_upper; - bool rev_iter_if_lower; - pack_t pack_schema; - packbuf_t pack_buf_type; + uint64_t size; // size field must be present and come first. + bszid_t bmid_m; + bszid_t bmid_n; + bool does_invert_diag; + bool rev_iter_if_upper; + bool rev_iter_if_lower; + pack_t pack_schema; + packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; -BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) +BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } -BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) +BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } -BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } -BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } -BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* 
)cntl->params; return ppp->rev_iter_if_lower; } -BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) +BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } -BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) +BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 5a7d716fe..67e02ac0e 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -37,12 +37,12 @@ bool bli_packm_init ( - obj_t* c, - obj_t* p, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* c, + obj_t* p, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ) { bli_init_once(); diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 152c6f15c..6f9b47273 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -34,11 +34,11 @@ BLIS_EXPORT_BLIS bool bli_packm_init ( - obj_t* a, - obj_t* p, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + obj_t* p, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index c9a2bb9db..f76607508 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -36,12 +36,12 @@ void bli_packm_int ( - obj_t* a, - obj_t* p, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + obj_t* p, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ) { bli_init_once(); diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 16a5c2c34..a4cf17d59 100644 --- 
a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -34,10 +34,10 @@ void bli_packm_int ( - obj_t* a, - obj_t* p, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + obj_t* p, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_part.c b/frame/1m/packm/bli_packm_part.c index 2fff4b7c8..feaaaeea8 100644 --- a/frame/1m/packm/bli_packm_part.c +++ b/frame/1m/packm/bli_packm_part.c @@ -38,11 +38,11 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_packm_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_packm_acquire_mpart_t2b( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m, n; @@ -110,11 +110,11 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, -void bli_packm_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_packm_acquire_mpart_l2r( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m, n; @@ -186,18 +186,18 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part, -void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } -dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ) +dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p ) { dim_t panel_off; diff --git a/frame/1m/packm/bli_packm_part.h b/frame/1m/packm/bli_packm_part.h index 5930d312e..39ee69a2c 100644 --- a/frame/1m/packm/bli_packm_part.h +++ b/frame/1m/packm/bli_packm_part.h @@ -34,23 +34,23 @@ // -- Matrix partitioning 
------------------------------------------------------ -void bli_packm_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_packm_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); +void bli_packm_acquire_mpart_t2b( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ); + +void bli_packm_acquire_mpart_l2r( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ); + +void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ); + +dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p ); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index dbdaf4738..3f0d48dbf 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -39,23 +39,23 @@ \ void PASTEMAC(ch,varname) \ ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ + ctype* kappa, \ + ctype* c, inc_t incc, inc_t ldc, \ + ctype* p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx \ ) \ { \ num_t dt = PASTEMAC(ch,type); \ diff --git 
a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h index 973a02612..f0293330b 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk.h @@ -37,26 +37,24 @@ \ void PASTEMAC(ch,varname) \ ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ + ctype* kappa, \ + ctype* c, inc_t incc, inc_t ldc, \ + ctype* p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) -INSERT_GENTPROT_BASIC0( packm_herm_cxk ) -INSERT_GENTPROT_BASIC0( packm_tri_cxk ) diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index b6165f516..f9f7f511c 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -58,11 +58,11 @@ static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); void bli_unpackm_blk_var1 ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ) { num_t dt_cp = bli_obj_dt( c ); @@ -108,19 +108,22 @@ void bli_unpackm_blk_var1 f = ftypes[dt_cp]; // Invoke the function. 
- f( strucc, - diagoffc, - diagc, - uploc, - transc, - m_c, - n_c, - m_panel, - n_panel, - buf_p, rs_p, cs_p, - pd_p, ps_p, - buf_c, rs_c, cs_c, - cntx ); + f + ( + strucc, + diagoffc, + diagc, + uploc, + transc, + m_c, + n_c, + m_panel, + n_panel, + buf_p, rs_p, cs_p, + pd_p, ps_p, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx + ); } @@ -144,29 +147,28 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict c_begin; \ - ctype* restrict p_begin; \ -\ - dim_t iter_dim; \ - dim_t num_iter; \ - dim_t it, ic, ip; \ - dim_t ic0, ip0; \ - doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ - dim_t panel_len; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - inc_t vs_c; \ - inc_t incc, ldc; \ - inc_t ldp; \ - dim_t* m_panel_full; \ - dim_t* n_panel_full; \ - pack_t schema; \ + ctype* one = PASTEMAC(ch,1); \ + ctype* c_cast = c; \ + ctype* p_cast = p; \ + ctype* c_begin; \ + ctype* p_begin; \ \ + dim_t iter_dim; \ + dim_t num_iter; \ + dim_t it, ic, ip; \ + dim_t ic0, ip0; \ + doff_t ic_inc, ip_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ + dim_t panel_len; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + inc_t vs_c; \ + inc_t incc, ldc; \ + inc_t ldp; \ + dim_t* m_panel_full; \ + dim_t* n_panel_full; \ + pack_t schema; \ \ /* If c needs a transposition, induce it so that we can more simply express the remaining parameters and code. 
*/ \ @@ -274,7 +276,7 @@ void PASTEMAC(ch,varname) \ one, \ p_begin, ldp, \ c_begin, incc, ldc, \ - cntx \ + ( cntx_t* )cntx \ ); \ } \ \ diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h index abd044549..4a92dc1b7 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.h +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h @@ -34,11 +34,11 @@ void bli_unpackm_blk_var1 ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ); diff --git a/frame/1m/unpackm/bli_unpackm_check.c b/frame/1m/unpackm/bli_unpackm_check.c index 5bce60ed3..786edd4c8 100644 --- a/frame/1m/unpackm/bli_unpackm_check.c +++ b/frame/1m/unpackm/bli_unpackm_check.c @@ -36,9 +36,9 @@ void bli_unpackm_int_check ( - obj_t* p, - obj_t* a, - cntx_t* cntx + const obj_t* p, + const obj_t* a, + const cntx_t* cntx ) { err_t e_val; diff --git a/frame/1m/unpackm/bli_unpackm_check.h b/frame/1m/unpackm/bli_unpackm_check.h index d2a976dd8..697010fa7 100644 --- a/frame/1m/unpackm/bli_unpackm_check.h +++ b/frame/1m/unpackm/bli_unpackm_check.h @@ -34,8 +34,8 @@ void bli_unpackm_int_check ( - obj_t* p, - obj_t* a, - cntx_t* cntx + const obj_t* p, + const obj_t* a, + const cntx_t* cntx ); diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 550a8fb87..f6b09d8ae 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -36,11 +36,11 @@ void bli_unpackm_int ( - obj_t* p, - obj_t* a, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* a, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ) { bli_init_once(); @@ -60,19 +60,19 @@ void bli_unpackm_int f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. 
- if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_ochief( thread ) ) { - f + f ( p, - a, + a, cntx, - cntl, + cntl, thread ); - } + } // Barrier so that unpacking is done before computation. - bli_thread_barrier( thread ); + bli_thread_barrier( thread ); } diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index cb66d0975..8258ea367 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -34,10 +34,10 @@ void bli_unpackm_int ( - obj_t* p, - obj_t* a, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* a, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ); diff --git a/frame/2/bli_l2_check.c b/frame/2/bli_l2_check.c index fac91fec4..a2772e1c4 100644 --- a/frame/2/bli_l2_check.c +++ b/frame/2/bli_l2_check.c @@ -36,11 +36,11 @@ void bli_gemv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -66,11 +66,11 @@ void bli_gemv_check void bli_hemv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -101,11 +101,11 @@ void bli_hemv_check void bli_symv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -136,9 +136,9 @@ void bli_symv_check void bli_trmv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x + const obj_t* alpha, + const obj_t* a, + const obj_t* x ) { err_t e_val; @@ -166,9 +166,9 @@ void bli_trmv_check void bli_trsv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x + const obj_t* alpha, + const obj_t* a, + const obj_t* x ) { err_t e_val; @@ -196,10 +196,10 @@ void bli_trsv_check void bli_ger_check ( - obj_t* alpha, - obj_t* x, - 
obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; @@ -225,9 +225,9 @@ void bli_ger_check void bli_her_check ( - obj_t* alpha, - obj_t* x, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* a ) { err_t e_val; @@ -255,10 +255,10 @@ void bli_her_check void bli_her2_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; @@ -289,9 +289,9 @@ void bli_her2_check void bli_syr_check ( - obj_t* alpha, - obj_t* x, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* a ) { err_t e_val; @@ -319,10 +319,10 @@ void bli_syr_check void bli_syr2_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; @@ -355,11 +355,11 @@ void bli_syr2_check void bli_xxmv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -424,10 +424,10 @@ void bli_xxmv_check void bli_xxr_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; diff --git a/frame/2/bli_l2_check.h b/frame/2/bli_l2_check.h index af9388753..b698e9d59 100644 --- a/frame/2/bli_l2_check.h +++ b/frame/2/bli_l2_check.h @@ -42,11 +42,11 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENPROT( gemv ) @@ -59,10 +59,10 @@ GENPROT( symv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ ); GENPROT( ger ) @@ -75,9 +75,9 @@ GENPROT( syr2 ) 
\ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* a \ ); GENPROT( her ) @@ -89,9 +89,9 @@ GENPROT( syr ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x \ ); GENPROT( trmv ) @@ -102,17 +102,17 @@ GENPROT( trsv ) void bli_xxmv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ); void bli_xxr_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ); diff --git a/frame/2/bli_l2_ft.h b/frame/2/bli_l2_ft.h index 73aa4dd48..8c48e2bed 100644 --- a/frame/2/bli_l2_ft.h +++ b/frame/2/bli_l2_ft.h @@ -44,15 +44,15 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -65,14 +65,14 @@ INSERT_GENTDEF( gemv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -85,15 +85,15 @@ INSERT_GENTDEF( ger ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t 
conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -107,12 +107,12 @@ INSERT_GENTDEF( symv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype_r* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -125,12 +125,12 @@ INSERT_GENTDEFR( her ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -143,14 +143,14 @@ INSERT_GENTDEF( syr ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -164,13 +164,13 @@ INSERT_GENTDEF( syr2 ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + 
const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/2/bli_l2_oapi.c b/frame/2/bli_l2_oapi.c index cc32fb61e..2eac6394c 100644 --- a/frame/2/bli_l2_oapi.c +++ b/frame/2/bli_l2_oapi.c @@ -45,11 +45,11 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -78,7 +78,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ + PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -96,17 +96,17 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - transa, \ - conjx, \ - m, \ - n, \ - buf_alpha, \ - buf_a, rs_a, cs_a, \ - buf_x, incx, \ - buf_beta, \ - buf_y, incy, \ - cntx, \ - rntm \ + transa, \ + conjx, \ + m, \ + n, \ + buf_alpha, \ + buf_a, rs_a, cs_a, \ + buf_x, incx, \ + buf_beta, \ + buf_y, incy, \ + cntx, \ + rntm \ ); \ } @@ -118,10 +118,10 @@ GENFRONT( gemv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -148,7 +148,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, y, a ); \ + PASTEMAC(opname,_check)( alpha, x, y, a ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ @@ -163,16 +163,16 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - conjx, \ - conjy, \ - m, \ - n, \ - buf_alpha, \ - buf_x, incx, \ - buf_y, incy, \ - buf_a, rs_a, cs_a, \ - cntx, \ - rntm \ + conjx, \ + conjy, \ + m, \ + n, \ + buf_alpha, \ + buf_x, incx, \ + buf_y, incy, \ + buf_a, rs_a, cs_a, \ + cntx, \ + rntm \ ); \ } @@ -184,11 +184,11 @@ GENFRONT( ger ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -217,7 +217,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ + PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -235,17 +235,17 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - uploa, \ - conja, \ - conjx, \ - m, \ - buf_alpha, \ - buf_a, rs_a, cs_a, \ - buf_x, incx, \ - buf_beta, \ - buf_y, incy, \ - cntx, \ - rntm \ + uploa, \ + conja, \ + conjx, \ + m, \ + buf_alpha, \ + buf_a, rs_a, cs_a, \ + buf_x, incx, \ + buf_beta, \ + buf_y, incy, \ + cntx, \ + rntm \ ); \ } @@ -258,9 +258,9 @@ GENFRONT( symv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -270,21 +270,21 @@ void PASTEMAC(opname,EX_SUF) \ \ num_t dt = bli_obj_dt( a ); \ \ - uplo_t uploa = bli_obj_uplo( a ); \ - conj_t conjx = bli_obj_conj_status( x ); \ + uplo_t uploa = bli_obj_uplo( a ); \ + conj_t conjx = bli_obj_conj_status( x ); \ dim_t m = bli_obj_length( a ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ - void* buf_a = bli_obj_buffer_at_off( a ); \ - inc_t rs_a = bli_obj_row_stride( a ); \ - inc_t cs_a = bli_obj_col_stride( a ); 
\ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ + void* buf_a = bli_obj_buffer_at_off( a ); \ + inc_t rs_a = bli_obj_row_stride( a ); \ + inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, a ); \ + PASTEMAC(opname,_check)( alpha, x, a ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -299,14 +299,14 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - uploa, \ - conjx, \ - m, \ - buf_alpha, \ - buf_x, incx, \ - buf_a, rs_a, cs_a, \ - cntx, \ - rntm \ + uploa, \ + conjx, \ + m, \ + buf_alpha, \ + buf_x, incx, \ + buf_a, rs_a, cs_a, \ + cntx, \ + rntm \ ); \ } @@ -319,10 +319,10 @@ GENFRONT( syr ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -349,7 +349,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, x, y, a ); \ + PASTEMAC(opname,_check)( alpha, x, y, a ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ @@ -364,16 +364,16 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - uploa, \ - conjx, \ - conjy, \ - m, \ - buf_alpha, \ - buf_x, incx, \ - buf_y, incy, \ - buf_a, rs_a, cs_a, \ - cntx, \ - rntm \ + uploa, \ + conjx, \ + conjy, \ + m, \ + buf_alpha, \ + buf_x, incx, \ + buf_y, incy, \ + buf_a, rs_a, cs_a, \ + cntx, \ + rntm \ ); \ } @@ -386,9 +386,9 @@ GENFRONT( syr2 ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -413,7 +413,7 @@ void PASTEMAC(opname,EX_SUF) \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( alpha, a, x ); \ + PASTEMAC(opname,_check)( alpha, a, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ @@ -428,15 +428,15 @@ void PASTEMAC(opname,EX_SUF) \ \ f \ ( \ - uploa, \ - transa, \ - diaga, \ - m, \ - buf_alpha, \ - buf_a, rs_a, cs_a, \ - buf_x, incx, \ - cntx, \ - rntm \ + uploa, \ + transa, \ + diaga, \ + m, \ + buf_alpha, \ + buf_a, rs_a, cs_a, \ + buf_x, incx, \ + cntx, \ + rntm \ ); \ } diff --git a/frame/2/bli_l2_oapi.h b/frame/2/bli_l2_oapi.h index 6b6a1d77e..391de06d5 100644 --- a/frame/2/bli_l2_oapi.h +++ b/frame/2/bli_l2_oapi.h @@ -42,11 +42,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -60,10 +60,10 @@ GENPROT( symv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); @@ -77,9 +77,9 @@ GENPROT( syr2 ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* 
x, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); @@ -92,9 +92,9 @@ GENPROT( syr ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c index f6eb6c7d9..4bef7c81a 100644 --- a/frame/2/bli_l2_tapi.c +++ b/frame/2/bli_l2_tapi.c @@ -45,15 +45,15 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -111,12 +111,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ n, \ - alpha, \ - a, rs_a, cs_a, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, rs_a, cs_a, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -128,14 +128,14 @@ INSERT_GENTFUNC_BASIC3( gemv, gemv, gemv_unf_var1, gemv_unf_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -164,11 +164,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjy, \ m, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -180,15 +180,15 @@ 
INSERT_GENTFUNC_BASIC3( ger, ger, ger_unb_var1, ger_unb_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -239,12 +239,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjh, /* used by variants to distinguish hemv from symv */ \ m, \ - alpha, \ - a, rs_a, cs_a, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, rs_a, cs_a, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -257,12 +257,12 @@ INSERT_GENTFUNC_BASIC4( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_v \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype_r* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -306,10 +306,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjh, /* used by variants to distinguish her from syr */ \ m, \ - &alpha_local, \ - x, incx, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )&alpha_local, \ + ( ctype* )x, incx, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -321,12 +321,12 @@ INSERT_GENTFUNCR_BASIC4( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + 
ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -363,10 +363,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjh, /* used by variants to distinguish her2 from syr2 */ \ m, \ - alpha, \ - x, incx, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -378,14 +378,14 @@ INSERT_GENTFUNC_BASIC4( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -423,11 +423,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjy, \ conjh, \ m, \ - alpha, \ - x, incx, \ - y, incy, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -440,13 +440,13 @@ INSERT_GENTFUNC_BASIC4( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_v \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -498,10 +498,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transa, \ diaga, \ m, \ - alpha, \ - a, rs_a, cs_a, \ - x, incx, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, rs_a, cs_a, \ + x, incx, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/2/bli_l2_tapi.h b/frame/2/bli_l2_tapi.h index 4b45236e2..edd9607b6 100644 --- a/frame/2/bli_l2_tapi.h +++ b/frame/2/bli_l2_tapi.h @@ 
-42,15 +42,15 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -62,14 +62,14 @@ INSERT_GENTPROT_BASIC0( gemv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -81,15 +81,15 @@ INSERT_GENTPROT_BASIC0( ger ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -102,12 +102,12 @@ INSERT_GENTPROT_BASIC0( symv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype_r* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -119,12 +119,12 @@ INSERT_GENTPROTR_BASIC0( her ) \ 
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -136,14 +136,14 @@ INSERT_GENTPROT_BASIC0( syr ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -156,13 +156,13 @@ INSERT_GENTPROT_BASIC0( syr2 ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/2/gemv/bli_gemv_var_oapi.c.prev b/frame/2/gemv/other/bli_gemv_var_oapi.c.prev similarity index 100% rename from frame/2/gemv/bli_gemv_var_oapi.c.prev rename to frame/2/gemv/other/bli_gemv_var_oapi.c.prev diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 1986b3b0f..78482b5f6 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -37,14 +37,14 @@ dim_t bli_l3_determine_kc ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx, - cntl_t* cntl + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* a, + const obj_t* b, + bszid_t bszid, + const cntx_t* cntx, + const cntl_t* cntl ) { 
opid_t family = bli_cntl_family( cntl ); @@ -75,13 +75,13 @@ dim_t bli_l3_determine_kc \ dim_t PASTEMAC0(opname) \ ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + const obj_t* a, \ + const obj_t* b, \ + bszid_t bszid, \ + const cntx_t* cntx \ ) \ { \ if ( direct == BLIS_FWD ) \ @@ -102,20 +102,14 @@ GENFRONT( trsm_determine_kc, trsm ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + const obj_t* a, \ + const obj_t* b, \ + bszid_t bszid, \ + const cntx_t* cntx \ ) \ { \ - num_t dt; \ - blksz_t* bsize; \ - dim_t mnr; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -130,15 +124,16 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. */ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR if A is Hermitian or symmetric, or NR if B is Hermitian or symmetric. If neither case applies, then we leave the blocksizes unchanged. 
*/ \ + dim_t mnr; \ if ( bli_obj_root_is_herm_or_symm( a ) ) \ { \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ @@ -154,9 +149,7 @@ dim_t PASTEMAC0(opname) \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( gemm_determine_kc_f, f ) @@ -169,19 +162,14 @@ GENFRONT( gemm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + const obj_t* a, \ + const obj_t* b, \ + bszid_t bszid, \ + const cntx_t* cntx \ ) \ { \ - num_t dt; \ - blksz_t* bsize; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -196,19 +184,17 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. */ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + const dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + const dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Notice that for gemmt, we do not need to perform any special handling for the default and maximum kc blocksizes vis-a-vis MR or NR. 
*/ \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( gemmt_determine_kc_f, f ) @@ -221,20 +207,14 @@ GENFRONT( gemmt_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + const obj_t* a, \ + const obj_t* b, \ + bszid_t bszid, \ + const cntx_t* cntx \ ) \ { \ - num_t dt; \ - blksz_t* bsize; \ - dim_t mnr; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -249,14 +229,15 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. */ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR if the triangular matrix is on the left, or NR if the triangular matrix is one the right. 
*/ \ + dim_t mnr; \ if ( bli_obj_root_is_triangular( a ) ) \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ else \ @@ -267,9 +248,7 @@ dim_t PASTEMAC0(opname) \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( trmm_determine_kc_f, f ) @@ -282,20 +261,14 @@ GENFRONT( trmm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + const obj_t* a, \ + const obj_t* b, \ + bszid_t bszid, \ + const cntx_t* cntx \ ) \ { \ - num_t dt; \ - blksz_t* bsize; \ - dim_t mnr; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -310,25 +283,23 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. */ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR. We always use MR (rather than sometimes using NR) because even when the triangle is on the right, packing of that matrix uses MR, since only left-side trsm micro-kernels are supported. 
*/ \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ + const dim_t mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ + b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( trsm_determine_kc_f, f ) diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 3ea3c5aa0..1ec889e03 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -34,14 +34,14 @@ dim_t bli_l3_determine_kc ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx, - cntl_t* cntl + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* a, + const obj_t* b, + bszid_t bszid, + const cntx_t* cntx, + const cntl_t* cntl ); @@ -50,13 +50,13 @@ dim_t bli_l3_determine_kc \ dim_t PASTEMAC0(opname) \ ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + const obj_t* a, \ + const obj_t* b, \ + bszid_t bszid, \ + const cntx_t* cntx \ ); GENPROT( gemm_determine_kc ) @@ -70,12 +70,12 @@ GENPROT( trsm_determine_kc ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + const obj_t* a, \ + const obj_t* b, \ + bszid_t bszid, \ + const cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 3e7882bc3..3b4d88746 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -36,12 +36,12 @@ void bli_gemm_check ( - obj_t* alpha, - obj_t* a, - 
obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { //err_t e_val; @@ -65,12 +65,12 @@ void bli_gemm_check void bli_gemmt_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -87,13 +87,13 @@ void bli_gemmt_check void bli_hemm_check ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -110,11 +110,11 @@ void bli_hemm_check void bli_herk_check ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -143,12 +143,12 @@ void bli_herk_check void bli_her2k_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -175,13 +175,13 @@ void bli_her2k_check void bli_symm_check ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -198,11 +198,11 @@ void bli_symm_check void bli_syrk_check ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -223,12 +223,12 @@ void bli_syrk_check void bli_syr2k_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - 
obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -250,13 +250,13 @@ void bli_syr2k_check void bli_trmm3_check ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -273,11 +273,11 @@ void bli_trmm3_check void bli_trmm_check ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx ) { err_t e_val; @@ -294,11 +294,11 @@ void bli_trmm_check void bli_trsm_check ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx ) { err_t e_val; @@ -317,12 +317,12 @@ void bli_trsm_check void bli_gemm_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -367,12 +367,12 @@ void bli_gemm_basic_check void bli_gemmt_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -389,13 +389,13 @@ void bli_gemmt_basic_check void bli_hemm_basic_check ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -433,12 +433,12 @@ void bli_hemm_basic_check void bli_herk_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* 
beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -476,14 +476,14 @@ void bli_herk_basic_check void bli_her2k_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* bh, - obj_t* b, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* bh, + const obj_t* b, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -537,12 +537,12 @@ void bli_her2k_basic_check void bli_l3_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h index c600d60b9..8551b6b61 100644 --- a/frame/3/bli_l3_check.h +++ b/frame/3/bli_l3_check.h @@ -42,12 +42,12 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ ); GENPROT( gemm ) @@ -61,13 +61,13 @@ GENPROT( syr2k ) \ void PASTEMAC(opname,_check) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ ); GENPROT( hemm ) @@ -80,11 +80,11 @@ GENPROT( trmm3 ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ ); GENPROT( herk ) @@ -96,11 +96,11 @@ GENPROT( syrk ) \ void PASTEMAC(opname,_check) \ ( \ - side_t side, \ - 
obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - cntx_t* cntx \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const cntx_t* cntx \ ); GENPROT( trmm ) @@ -111,63 +111,63 @@ GENPROT( trsm ) void bli_gemm_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_gemmt_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_hemm_basic_check ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_herk_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_her2k_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* bh, - obj_t* b, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* bh, + const obj_t* b, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_l3_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 83ff8e5af..d7fd9649e 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -38,15 +38,15 @@ void bli_l3_cntl_create_if ( - opid_t family, - pack_t schema_a, - pack_t schema_b, - obj_t* a, - obj_t* b, - obj_t* c, - 
rntm_t* rntm, - cntl_t* cntl_orig, - cntl_t** cntl_use + opid_t family, + pack_t schema_a, + pack_t schema_b, + const obj_t* a, + const obj_t* b, + const obj_t* c, + rntm_t* rntm, + cntl_t* cntl_orig, + cntl_t** cntl_use ) { // If the control tree pointer is NULL, we construct a default diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index c308c8a96..eb4321ecd 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -40,15 +40,15 @@ void bli_l3_cntl_create_if ( - opid_t family, - pack_t schema_a, - pack_t schema_b, - obj_t* a, - obj_t* b, - obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, - cntl_t** cntl_use + opid_t family, + pack_t schema_a, + pack_t schema_b, + const obj_t* a, + const obj_t* b, + const obj_t* c, + rntm_t* rntm, + cntl_t* cntl_orig, + cntl_t** cntl_use ); void bli_l3_cntl_free diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 0d0a71921..bbc4af7a0 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -36,10 +36,10 @@ dir_t bli_l3_direct ( - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl ) { // Query the operation family. @@ -58,9 +58,9 @@ dir_t bli_l3_direct dir_t bli_gemm_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { // For gemm, movement may be forwards (or backwards). @@ -70,9 +70,9 @@ dir_t bli_gemm_direct dir_t bli_gemmt_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { // For gemmt, movement may be forwards (or backwards). 
@@ -82,9 +82,9 @@ dir_t bli_gemmt_direct dir_t bli_trmm_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { dir_t direct; @@ -111,9 +111,9 @@ dir_t bli_trmm_direct dir_t bli_trsm_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { dir_t direct; diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 39798407a..8f624098e 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -34,10 +34,10 @@ dir_t bli_l3_direct ( - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl ); // ----------------------------------------------------------------------------- @@ -47,9 +47,9 @@ dir_t bli_l3_direct \ dir_t PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c \ ); GENPROT( gemm_direct ) diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h index 28065c208..e7952409f 100644 --- a/frame/3/bli_l3_ft_ukr.h +++ b/frame/3/bli_l3_ft_ukr.h @@ -55,8 +55,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( gemm ) @@ -78,8 +78,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( gemmtrsm ) @@ -95,8 +95,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( trsm ) diff --git a/frame/3/bli_l3_ind_ukr.h 
b/frame/3/bli_l3_ind_ukr.h index 6f24e71fc..243ff818d 100644 --- a/frame/3/bli_l3_ind_ukr.h +++ b/frame/3/bli_l3_ind_ukr.h @@ -51,8 +51,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemm1m_ukr_name ) @@ -72,8 +72,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name ) @@ -88,8 +88,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name ) diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index d4b974030..b786236ab 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -36,15 +36,15 @@ void bli_l3_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { obj_t a_local; diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h index d76b0ac3e..65485206d 100644 --- a/frame/3/bli_l3_int.h +++ b/frame/3/bli_l3_int.h @@ -34,14 +34,14 @@ void bli_l3_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/bli_l3_oapi.c 
b/frame/3/bli_l3_oapi.c index 1df8e8012..0365a198c 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -43,11 +43,11 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -66,12 +66,12 @@ GENFRONT( syr2k ) \ void PASTEMAC0(opname) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -89,10 +89,10 @@ GENFRONT( trmm3 ) \ void PASTEMAC0(opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -109,10 +109,10 @@ GENFRONT( syrk ) \ void PASTEMAC0(opname) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h index e00f238ad..7161a3bf3 100644 --- a/frame/3/bli_l3_oapi.h +++ b/frame/3/bli_l3_oapi.h @@ -43,11 +43,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ); GENPROT( gemm ) @@ -61,12 +61,12 @@ GENPROT( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + side_t side, \ + const obj_t* 
alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ); GENPROT( hemm ) @@ -79,10 +79,10 @@ GENPROT( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c \ ); GENPROT( herk ) @@ -94,10 +94,10 @@ GENPROT( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b \ ); GENPROT( trmm ) diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index cd0df7017..e4c815fe3 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -44,13 +44,13 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -117,13 +117,13 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -166,13 +166,13 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -212,13 +212,13 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* 
alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -244,14 +244,14 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -294,14 +294,14 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) void PASTEMAC(symm,BLIS_OAPI_EX_SUF) ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -344,14 +344,14 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -394,12 +394,12 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) void PASTEMAC(herk,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -428,12 +428,12 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF) void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -453,12 +453,12 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) void 
PASTEMAC(trmm,BLIS_OAPI_EX_SUF) ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -500,12 +500,12 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 946a7aa17..58091704b 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -43,13 +43,13 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( gemm ) @@ -63,14 +63,14 @@ GENPROT( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( hemm ) @@ -83,12 +83,12 @@ GENPROT( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( herk ) @@ -100,12 +100,12 @@ GENPROT( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, 
\ - obj_t* b, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( trmm ) diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h index e7c8dcca3..997ade58e 100644 --- a/frame/3/bli_l3_oft.h +++ b/frame/3/bli_l3_oft.h @@ -48,13 +48,13 @@ \ typedef void (*PASTECH(opname,_oft)) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENTDEF( gemm ) @@ -70,14 +70,14 @@ GENTDEF( syr2k ) \ typedef void (*PASTECH(opname,_oft)) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENTDEF( hemm ) @@ -92,12 +92,12 @@ GENTDEF( trmm3 ) \ typedef void (*PASTECH(opname,_oft)) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENTDEF( herk ) @@ -111,12 +111,12 @@ GENTDEF( syrk ) \ typedef void (*PASTECH(opname,_oft)) \ ( \ - side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENTDEF( trmm ) diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index ea10d8090..ee529b115 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -45,13 +45,13 @@ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - 
cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENTDEF( l3 ) diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c index d91181942..6f18169b2 100644 --- a/frame/3/bli_l3_packab.c +++ b/frame/3/bli_l3_packab.c @@ -36,13 +36,13 @@ void bli_l3_packa ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { obj_t a_local, a_pack; @@ -84,13 +84,13 @@ void bli_l3_packa void bli_l3_packb ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { obj_t bt_local, bt_pack; diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h index 380ca7212..f03b7f62c 100644 --- a/frame/3/bli_l3_packab.h +++ b/frame/3/bli_l3_packab.h @@ -34,23 +34,23 @@ void bli_l3_packa ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); void bli_l3_packb ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index 6ca8244cb..6531b74a8 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -34,174 +34,106 @@ #include "blis.h" -/* + void bli_l3_prune_unref_mparts_m ( - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl + obj_t* a, + const 
obj_t* b, + obj_t* c, + const cntl_t* cntl ) { - // Query the operation family. + /* Query the operation family. */ opid_t family = bli_cntl_family( cntl ); - if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. - else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c ); - else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); - else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); -} -*/ - -#undef GENFRONT -#define GENFRONT( dim ) \ -\ -void PASTEMAC(l3_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntl_t* cntl \ - ) \ -{ \ - /* Query the operation family. */ \ - opid_t family = bli_cntl_family( cntl ); \ -\ - if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \ - else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \ - else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ - else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ + if ( family == BLIS_GEMM ) + { + /* No pruning is necessary for gemm. */ + return; + } + else if ( family == BLIS_GEMMT ) + { + /* Prune any unreferenced part from the subpartition of C (that would + be encountered from partitioning in the m dimension) and adjust the + subpartition of A accordingly. */ + bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M ); + } + else if ( family == BLIS_TRMM || + family == BLIS_TRSM ) + { + /* Prune any unreferenced part from the subpartition of A (that would + be encountered from partitioning in the m dimension) and adjust the + subpartition of C accordingly. 
*/ + bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M ); + } } -GENFRONT( m ) -GENFRONT( n ) -GENFRONT( k ) - -// ----------------------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_prune_unref_mparts_m) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* No pruning is necessary for gemm. */ \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_n) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* No pruning is necessary for gemm. */ \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_k) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* No pruning is necessary for gemm. */ \ -} - -GENFRONT( gemm ) - -// ----------------------------------------------------------------------------- +void bli_l3_prune_unref_mparts_n + ( + const obj_t* a, + obj_t* b, + obj_t* c, + const cntl_t* cntl + ) +{ + /* Query the operation family. */ + opid_t family = bli_cntl_family( cntl ); -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_prune_unref_mparts_m) \ - ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of C (that would - be encountered from partitioning in the m dimension) and adjust the - subpartition of A accordingly. */ \ - bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_n) \ - ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of C (that would - be encountered from partitioning in the n dimension) and adjust the - subpartition of Ah accordingly. */ \ - bli_prune_unref_mparts( c, BLIS_N, ah, BLIS_N ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_k) \ - ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c \ - ) \ -{ \ - /* As long as A and Ah are general in structure, no pruning should be - for the k dimension. 
*/ \ + if ( family == BLIS_GEMM ) + { + /* No pruning is necessary for gemm. */ + return; + } + else if ( family == BLIS_GEMMT ) + { + /* Prune any unreferenced part from the subpartition of C (that would + be encountered from partitioning in the n dimension) and adjust the + subpartition of B accordingly. */ + bli_prune_unref_mparts( c, BLIS_N, b, BLIS_N ); + } + else if ( family == BLIS_TRMM || + family == BLIS_TRSM ) + { + /* Prune any unreferenced part from the subpartition of B (that would + be encountered from partitioning in the n dimension) and adjust the + subpartition of C accordingly. */ + bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N ); + } } -GENFRONT( gemmt ) - -// ----------------------------------------------------------------------------- +void bli_l3_prune_unref_mparts_k + ( + obj_t* a, + obj_t* b, + const obj_t* c, + const cntl_t* cntl + ) +{ + /* Query the operation family. */ + opid_t family = bli_cntl_family( cntl ); -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_prune_unref_mparts_m) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of A (that would - be encountered from partitioning in the m dimension) and adjust the - subpartition of C accordingly. */ \ - bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_n) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of B (that would - be encountered from partitioning in the n dimension) and adjust the - subpartition of C accordingly. */ \ - bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_k) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of A (that would - be encountered from partitioning in the k dimension) and adjust the - subpartition of B accordingly.
*/ \ - bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M ); \ -\ - /* Prune any unreferenced part from the subpartition of B (that would - be encountered from partitioning in the k dimension) and adjust the - subpartition of A accordingly. */ \ - bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N ); \ + if ( family == BLIS_GEMM ) + { + /* No pruning is necessary for gemm. */ + return; + } + else if ( family == BLIS_GEMMT ) + { + /* No pruning is necessary for gemmt. */ + return; + } + else if ( family == BLIS_TRMM || + family == BLIS_TRSM ) + { + /* Prune any unreferenced part from the subpartition of A (that would + be encountered from partitioning in the k dimension) and adjust the + subpartition of B accordingly. */ + bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M ); + + /* Prune any unreferenced part from the subpartition of B (that would + be encountered from partitioning in the k dimension) and adjust the + subpartition of A accordingly. */ + bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N ); + } } -GENFRONT( trmm ) -GENFRONT( trsm ) - - diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index ad8f07dc4..84c0cbbcd 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -33,46 +33,27 @@ */ -#undef GENPROT -#define GENPROT( dim ) \ -\ -void PASTEMAC(l3_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntl_t* cntl \ +void bli_l3_prune_unref_mparts_m + ( + obj_t* a, + const obj_t* b, + obj_t* c, + const cntl_t* cntl ); -GENPROT( m ) -GENPROT( n ) -GENPROT( k ) - -// ----------------------------------------------------------------------------- - -#undef GENPROT -#define GENPROT( opname, dim ) \ -\ -void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ +void bli_l3_prune_unref_mparts_n + ( + const obj_t* a, + obj_t* b, + obj_t* c, + const cntl_t* cntl ); -GENPROT( gemm, m ) -GENPROT( gemm, n ) -GENPROT( gemm, k ) - -GENPROT( gemmt, m ) -GENPROT( gemmt, n ) -GENPROT( gemmt, k ) - -GENPROT( 
trmm, m ) -GENPROT( trmm, n ) -GENPROT( trmm, k ) - -GENPROT( trsm, m ) -GENPROT( trsm, n ) -GENPROT( trsm, k ) +void bli_l3_prune_unref_mparts_k + ( + obj_t* a, + obj_t* b, + const obj_t* c, + const cntl_t* cntl + ); diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c index 1d4608799..1de381f37 100644 --- a/frame/3/bli_l3_schema.c +++ b/frame/3/bli_l3_schema.c @@ -36,10 +36,10 @@ void bli_l3_set_schemas ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx + obj_t* a, + obj_t* b, + const obj_t* c, + const cntx_t* cntx ) { // Begin with pack schemas for native execution. diff --git a/frame/3/bli_l3_schema.h b/frame/3/bli_l3_schema.h index c6a12ce52..a909bf598 100644 --- a/frame/3/bli_l3_schema.h +++ b/frame/3/bli_l3_schema.h @@ -34,8 +34,8 @@ void bli_l3_set_schemas ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx + obj_t* a, + obj_t* b, + const obj_t* c, + const cntx_t* cntx ); diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index 7e37e1f22..eedbd9ec5 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -36,13 +36,13 @@ err_t bli_gemmsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. @@ -134,13 +134,13 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n", err_t bli_gemmtsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. 
diff --git a/frame/3/bli_l3_sup.h b/frame/3/bli_l3_sup.h index fe6d0483e..33b3f8ca7 100644 --- a/frame/3/bli_l3_sup.h +++ b/frame/3/bli_l3_sup.h @@ -34,23 +34,23 @@ err_t bli_gemmsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); err_t bli_gemmtsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); diff --git a/frame/3/bli_l3_sup_ft_ker.h b/frame/3/bli_l3_sup_ft_ker.h index 5bb2218f3..dbeafb404 100644 --- a/frame/3/bli_l3_sup_ft_ker.h +++ b/frame/3/bli_l3_sup_ft_ker.h @@ -57,8 +57,8 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( gemmsup ) diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index 3da3954fa..3ff13bdb5 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -36,14 +36,14 @@ err_t bli_gemmsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ) { #if 0 @@ -240,14 +240,14 @@ err_t bli_gemmsup_int err_t bli_gemmtsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ) { const 
stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h index c6cb88056..195e3ca40 100644 --- a/frame/3/bli_l3_sup_int.h +++ b/frame/3/bli_l3_sup_int.h @@ -34,24 +34,24 @@ err_t bli_gemmsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ); err_t bli_gemmtsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_ker_prot.h b/frame/3/bli_l3_sup_ker_prot.h index 899a47d3f..30cad5257 100644 --- a/frame/3/bli_l3_sup_ker_prot.h +++ b/frame/3/bli_l3_sup_ker_prot.h @@ -50,7 +50,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h index 98a06cf57..ba60035b7 100644 --- a/frame/3/bli_l3_sup_oft.h +++ b/frame/3/bli_l3_sup_oft.h @@ -47,13 +47,13 @@ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENTDEF( gemmsup ) diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c index 56726c5f8..6b73050fd 100644 --- a/frame/3/bli_l3_sup_packm_a.c +++ b/frame/3/bli_l3_sup_packm_a.c @@ -40,15 +40,15 @@ \ void 
PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix A. */ \ @@ -174,10 +174,10 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a ) \ void PASTEMAC(ch,opname) \ ( \ - bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool did_pack, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ) \ { \ /* Inspect whether we previously packed matrix A. */ \ @@ -212,20 +212,20 @@ INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* restrict schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* restrict m_max, \ - dim_t* restrict k_max, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + stor3_t stor_id, \ + pack_t* schema, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* m_max, \ + dim_t* k_max, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix A. */ \ @@ -238,11 +238,11 @@ void PASTEMAC(ch,opname) \ source matrix A directly). */ \ { \ /* Use the strides of the source matrix as the final values. 
*/ \ - *rs_p = rs_x; \ - *cs_p = cs_x; \ + *rs_p = rs_a; \ + *cs_p = cs_a; \ \ *pd_p = mr; \ - *ps_p = mr * rs_x; \ + *ps_p = mr * rs_a; \ \ /* Set the schema to "not packed" to indicate that packing will be skipped. */ \ @@ -251,7 +251,7 @@ void PASTEMAC(ch,opname) \ \ /* Since we won't be packing, simply update the buffer address provided by the caller to point to source matrix. */ \ - *p = x; \ + *p = a; \ } \ else /* if ( will_pack == TRUE ) */ \ { \ @@ -311,23 +311,23 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_a ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + stor3_t stor_id, \ + trans_t transc, \ + dim_t m_alloc, \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* kappa, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ pack_t schema; \ diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h index 95c9582e7..0aaa302c8 100644 --- a/frame/3/bli_l3_sup_packm_a.h +++ b/frame/3/bli_l3_sup_packm_a.h @@ -38,15 +38,15 @@ \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ); \ 
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) @@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) \ void PASTEMAC(ch,opname) \ ( \ - bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool did_pack, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) @@ -71,20 +71,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* restrict schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* restrict m_max, \ - dim_t* restrict k_max, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + stor3_t stor_id, \ + pack_t* schema, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + dim_t* m_max, \ + dim_t* k_max, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) @@ -95,23 +95,23 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + stor3_t stor_id, \ + trans_t transc, \ + dim_t m_alloc, \ + dim_t k_alloc, \ + dim_t m, \ + dim_t k, \ + dim_t mr, \ + ctype* kappa, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + 
ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c index 32c14afe3..7a2030ccf 100644 --- a/frame/3/bli_l3_sup_packm_b.c +++ b/frame/3/bli_l3_sup_packm_b.c @@ -40,15 +40,15 @@ \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix B. */ \ @@ -174,10 +174,10 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b ) \ void PASTEMAC(ch,opname) \ ( \ - bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool did_pack, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ) \ { \ /* Inspect whether we previously packed matrix A. 
*/ \ @@ -212,20 +212,20 @@ INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* restrict schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* restrict k_max, \ - dim_t* restrict n_max, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + stor3_t stor_id, \ + pack_t* schema, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* k_max, \ + dim_t* n_max, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix B. */ \ @@ -238,11 +238,11 @@ void PASTEMAC(ch,opname) \ source matrix B directly). */ \ { \ /* Use the strides of the source matrix as the final values. */ \ - *rs_p = rs_x; \ - *cs_p = cs_x; \ + *rs_p = rs_b; \ + *cs_p = cs_b; \ \ *pd_p = nr; \ - *ps_p = nr * cs_x; \ + *ps_p = nr * cs_b; \ \ /* Set the schema to "not packed" to indicate that packing will be skipped. */ \ @@ -251,7 +251,7 @@ void PASTEMAC(ch,opname) \ \ /* Since we won't be packing, simply update the buffer address provided by the caller to point to source matrix. 
*/ \ - *p = x; \ + *p = b; \ } \ else /* if ( will_pack == TRUE ) */ \ { \ @@ -311,23 +311,23 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_b ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* restrict kappa, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + stor3_t stor_id, \ + trans_t transc, \ + dim_t k_alloc, \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* kappa, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ pack_t schema; \ diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h index 2965727d5..bd18e5887 100644 --- a/frame/3/bli_l3_sup_packm_b.h +++ b/frame/3/bli_l3_sup_packm_b.h @@ -38,15 +38,15 @@ \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) @@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) \ void PASTEMAC(ch,opname) \ ( \ - bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool did_pack, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) @@ 
-71,20 +71,20 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* restrict schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* restrict k_max, \ - dim_t* restrict n_max, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + stor3_t stor_id, \ + pack_t* schema, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + dim_t* k_max, \ + dim_t* n_max, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) @@ -95,23 +95,23 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) \ void PASTEMAC(ch,opname) \ ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* restrict kappa, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + bool will_pack, \ + packbuf_t pack_buf_type, \ + stor3_t stor_id, \ + trans_t transc, \ + dim_t k_alloc, \ + dim_t n_alloc, \ + dim_t k, \ + dim_t n, \ + dim_t nr, \ + ctype* kappa, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 519dc5ccd..54ecab8ff 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -44,39 +44,39 @@ \ void 
PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ -\ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it, ic; \ - dim_t ic0; \ - doff_t ic_inc; \ - dim_t panel_len_full; \ - dim_t panel_len_i; \ - dim_t panel_len_max; \ - dim_t panel_len_max_i; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - inc_t vs_c; \ - inc_t ldc; \ - inc_t ldp, p_inc; \ - conj_t conjc; \ + ctype* kappa_cast = kappa; \ + ctype* c_cast = c; \ + ctype* p_cast = p; \ +\ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it, ic; \ + dim_t ic0; \ + doff_t ic_inc; \ + dim_t panel_len_full; \ + dim_t panel_len_i; \ + dim_t panel_len_max; \ + dim_t panel_len_max_i; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + inc_t vs_c; \ + inc_t ldc; \ + inc_t ldp, p_inc; \ + conj_t conjc; \ \ \ /* Extract the conjugation bit from the transposition argument. */ \ @@ -141,7 +141,7 @@ void PASTEMAC(ch,varname) \ ic_inc = panel_dim_max; \ } \ \ - ctype* restrict p_begin = p_cast; \ + ctype* p_begin = p_cast; \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. 
*/ \ @@ -166,10 +166,10 @@ void PASTEMAC(ch,varname) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ - ctype* restrict c_begin = c_cast + (ic )*vs_c; \ + ctype* c_begin = c_cast + (ic )*vs_c; \ \ - ctype* restrict c_use = c_begin; \ - ctype* restrict p_use = p_begin; \ + ctype* c_use = c_begin; \ + ctype* p_use = p_begin; \ \ { \ panel_len_i = panel_len_full; \ @@ -317,28 +317,28 @@ bli_thread_barrier( thread ); \ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ + ctype* kappa_cast = kappa; \ + ctype* c_cast = c; \ + ctype* p_cast = p; \ \ - dim_t iter_dim; \ - dim_t n_iter; \ - dim_t it; \ - dim_t vector_len; \ - inc_t incc, ldc; \ - inc_t incp, ldp; \ - conj_t conjc; \ + dim_t iter_dim; \ + dim_t n_iter; \ + dim_t it; \ + dim_t vector_len; \ + inc_t incc, ldc; \ + inc_t incp, ldp; \ + conj_t conjc; \ \ \ /* Extract the conjugation bit from the transposition argument. */ \ @@ -384,7 +384,7 @@ void PASTEMAC(ch,varname) \ n_iter = iter_dim; \ \ \ - ctype* restrict p_begin = p_cast; \ + ctype* p_begin = p_cast; \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ @@ -406,10 +406,10 @@ void PASTEMAC(ch,varname) \ /* Iterate over every logical micropanel in the source matrix. 
*/ \ for ( it = 0; it < n_iter; it += 1 ) \ { \ - ctype* restrict c_begin = c_cast + (it )*ldc; \ + ctype* c_begin = c_cast + (it )*ldc; \ \ - ctype* restrict c_use = c_begin; \ - ctype* restrict p_use = p_begin; \ + ctype* c_use = c_begin; \ + ctype* p_use = p_begin; \ \ { \ /* The definition of bli_packm_my_iter() will depend on whether slab diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h index 5ccdd3b76..9c62c9c68 100644 --- a/frame/3/bli_l3_sup_packm_var.h +++ b/frame/3/bli_l3_sup_packm_var.h @@ -42,18 +42,18 @@ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) @@ -63,15 +63,15 @@ INSERT_GENTPROT_BASIC0( packm_sup_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) diff --git a/frame/3/bli_l3_sup_ref.c b/frame/3/bli_l3_sup_ref.c index f03ec1b18..8eb7a6d4b 100644 --- a/frame/3/bli_l3_sup_ref.c +++ b/frame/3/bli_l3_sup_ref.c @@ -36,13 +36,13 @@ err_t bli_gemmsup_ref ( - obj_t* alpha, 
- obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // This function implements the default gemmsup handler. If you are a @@ -124,13 +124,13 @@ err_t bli_gemmsup_ref err_t bli_gemmtsup_ref ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // This function implements the default gemmtsup handler. If you are a diff --git a/frame/3/bli_l3_sup_ref.h b/frame/3/bli_l3_sup_ref.h index bce4e1729..4d4811db3 100644 --- a/frame/3/bli_l3_sup_ref.h +++ b/frame/3/bli_l3_sup_ref.h @@ -34,23 +34,23 @@ err_t bli_gemmsup_ref ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); err_t bli_gemmtsup_ref ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c index 106ad86e4..d65482243 100644 --- a/frame/3/bli_l3_sup_var12.c +++ b/frame/3/bli_l3_sup_var12.c @@ -38,19 +38,19 @@ typedef void (*FUNCPTR_T) ( - conj_t conja, - conj_t conjb, - dim_t m, - dim_t n, - dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - stor3_t eff_id, - cntx_t* restrict cntx, - rntm_t* restrict rntm + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t rs_a, 
inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + stor3_t eff_id, + cntx_t* cntx, + rntm_t* rntm ); #if 0 @@ -95,20 +95,20 @@ void bli_gemmsup_ref_var2 const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); #else @@ -121,11 +121,11 @@ void bli_gemmsup_ref_var2 const dim_t n = bli_obj_width( c ); dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); + void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); + void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -157,12 +157,12 @@ void bli_gemmsup_ref_var2 cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); #endif @@ -200,14 +200,14 @@ void PASTEMAC(ch,varname) \ 
dim_t m, \ dim_t n, \ dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ /* If any dimension is zero, return immediately. */ \ @@ -266,13 +266,13 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmsup_ker_ft) \ gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ - ctype* restrict one = PASTEMAC(ch,1); \ + ctype* one = PASTEMAC(ch,1); \ \ auxinfo_t aux; \ \ @@ -305,8 +305,8 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \ \ - ctype* restrict b_jc = b_00 + jj * jcstep_b; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* b_jc = b_00 + jj * jcstep_b; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ const dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ const dim_t jr_left = nc_cur % NR; \ @@ -316,19 +316,19 @@ void PASTEMAC(ch,varname) \ { \ const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_00 + pp * pcstep_a; \ - ctype* restrict b_pc = b_jc + pp * pcstep_b; \ + ctype* a_pc = a_00 + pp * pcstep_a; \ + ctype* b_pc = b_jc + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \ + ctype* beta_use = ( pp == 0 ? 
beta_cast : one ); \ \ /* Loop over the m dimension (MC rows at a time). */ \ for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \ { \ const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \ \ - ctype* restrict a_ic = a_pc + ii * icstep_a; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* a_ic = a_pc + ii * icstep_a; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ const dim_t ir_left = mc_cur % MR; \ @@ -338,11 +338,11 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ - ctype* restrict b_jr = b_pc + j * jrstep_b; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* b_jr = b_pc + j * jrstep_b; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* - ctype* restrict b2 = b_jr; \ + ctype* b2 = b_jr; \ */ \ \ /* Loop over the m dimension (MR rows at a time). */ \ @@ -350,13 +350,13 @@ void PASTEMAC(ch,varname) \ { \ const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ \ - ctype* restrict a_ir = a_ic + i * irstep_a; \ - ctype* restrict c_ir = c_jr + i * irstep_c; \ + ctype* a_ir = a_ic + i * irstep_a; \ + ctype* c_ir = c_jr + i * irstep_c; \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. 
*/ \ /* - ctype* restrict a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \ + ctype* a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \ if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \ { \ a2 = a_00; \ @@ -442,20 +442,20 @@ void bli_gemmsup_ref_var1 const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); #else @@ -468,11 +468,11 @@ void bli_gemmsup_ref_var1 const dim_t n = bli_obj_width( c ); dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); + void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); + void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -504,12 +504,12 @@ void bli_gemmsup_ref_var1 cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( 
dt_exec, beta ); #endif @@ -547,14 +547,14 @@ void PASTEMAC(ch,varname) \ dim_t m, \ dim_t n, \ dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ /* If any dimension is zero, return immediately. */ \ @@ -617,13 +617,13 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmsup_ker_ft) \ gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ - ctype* restrict one = PASTEMAC(ch,1); \ + ctype* one = PASTEMAC(ch,1); \ \ auxinfo_t aux; \ \ @@ -656,8 +656,8 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \ \ - ctype* restrict a_jc = a_00 + jj * jcstep_a; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* a_jc = a_00 + jj * jcstep_a; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ const dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ const dim_t jr_left = nc_cur % MR; \ @@ -667,19 +667,19 @@ void PASTEMAC(ch,varname) \ { \ const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_jc + pp * pcstep_a; \ - ctype* restrict b_pc = b_00 + pp * pcstep_b; \ + ctype* a_pc = a_jc + pp * pcstep_a; \ + ctype* b_pc = b_00 + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? 
beta_cast : one ); \ + ctype* beta_use = ( pp == 0 ? beta_cast : one ); \ \ /* Loop over the n dimension (MC rows at a time). */ \ for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \ { \ const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \ \ - ctype* restrict b_ic = b_pc + ii * icstep_b; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* b_ic = b_pc + ii * icstep_b; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ const dim_t ir_left = mc_cur % NR; \ @@ -689,16 +689,16 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ - ctype* restrict a_jr = a_pc + j * jrstep_a; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* a_jr = a_pc + j * jrstep_a; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* Loop over the n dimension (MR rows at a time). */ \ for ( dim_t i = 0; i < ir_iter; i += ir_inc ) \ { \ const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ \ - ctype* restrict b_ir = b_ic + i * irstep_b; \ - ctype* restrict c_ir = c_jr + i * irstep_c; \ + ctype* b_ir = b_ic + i * irstep_b; \ + ctype* c_ir = c_jr + i * irstep_c; \ \ /* Invoke the gemmsup micro-kernel. 
*/ \ gemmsup_ker \ diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index acc4c3071..a5d66783f 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -38,22 +38,22 @@ typedef void (*FUNCPTR_T) ( - bool packa, - bool packb, - conj_t conja, - conj_t conjb, - dim_t m, - dim_t n, - dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - stor3_t eff_id, - cntx_t* restrict cntx, - rntm_t* restrict rntm, - thrinfo_t* restrict thread + bool packa, + bool packb, + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t rs_a, inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + stor3_t eff_id, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ); // @@ -64,16 +64,16 @@ static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n); void bli_gemmsup_ref_var1n ( - trans_t trans, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - stor3_t eff_id, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + trans_t trans, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + stor3_t eff_id, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ) { #if 0 @@ -98,41 +98,41 @@ void bli_gemmsup_ref_var1n const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const 
inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #else - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + const bool packa = bli_rntm_pack_a( rntm ); + const bool packb = bli_rntm_pack_b( rntm ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - dim_t k; + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a; - inc_t cs_a; + const void* buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a; + inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b; - inc_t cs_b; + const void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b; + inc_t cs_b; if ( bli_obj_has_notrans( a ) ) { @@ -162,12 +162,12 @@ void bli_gemmsup_ref_var1n cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #endif @@ -193,13 +193,13 @@ void bli_gemmsup_ref_var1n m, n, k, - buf_alpha, - buf_a, rs_a, cs_a, - buf_b, rs_b, 
cs_b, - buf_beta, - buf_c, rs_c, cs_c, + ( void* )buf_alpha, + ( void* )buf_a, rs_a, cs_a, + ( void* )buf_b, rs_b, cs_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, eff_id, - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -216,13 +216,13 @@ void bli_gemmsup_ref_var1n n, // swap the m and n dimensions. m, k, - buf_alpha, - buf_b, cs_b, rs_b, // swap the positions of A and B. - buf_a, cs_a, rs_a, // swap the strides of A and B. - buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. + ( void* )buf_alpha, + ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. + ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. + ( void* )buf_beta, + buf_c, cs_c, rs_c, // swap the strides of C. bli_stor3_trans( eff_id ), // transpose the stor3_t id. - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -235,22 +235,22 @@ void bli_gemmsup_ref_var1n \ void PASTEMAC(ch,varname) \ ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -365,20 +365,20 @@ void PASTEMAC(ch,varname) \ /* Query the context for the sup microkernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ /* Make local copies of beta and one scalars to prevent any unnecessary sharing of cache lines between the cores' caches. */ \ ctype beta_local = *beta_cast; \ ctype one_local = *PASTEMAC(ch,1); \ \ - auxinfo_t aux; \ + auxinfo_t aux; \ \ /* Parse and interpret the contents of the rntm_t object to properly set the ways of parallelism for each loop. */ \ @@ -408,12 +408,12 @@ void PASTEMAC(ch,varname) \ That is, this panel-block algorithm partitions an NC x KC submatrix of A to be packed in the 4th loop, and a KC x MC submatrix of B to be packed in the 3rd loop. */ \ - /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ + /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* restrict bszids; \ + bszid_t* bszids; \ \ /* Set the bszids pointer to the correct bszids array above based on which matrices (if any) are being packed. */ \ @@ -425,16 +425,16 @@ void PASTEMAC(ch,varname) \ /* Determine whether we are using more than one thread. 
*/ \ const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ \ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* thread_jc = NULL; \ + thrinfo_t* thread_pc = NULL; \ + thrinfo_t* thread_pa = NULL; \ + thrinfo_t* thread_ic = NULL; \ + thrinfo_t* thread_pb = NULL; \ + thrinfo_t* thread_jr = NULL; \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_jc = bszids; \ - thread_jc = thread; \ + bszid_t* bszids_jc = bszids; \ + thread_jc = thread; \ bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ \ /* Compute the JC loop thread range for the current thread. */ \ @@ -453,12 +453,12 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current JC block dimension. */ \ const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ \ - ctype* restrict a_jc = a_00 + jj * jcstep_a; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* a_jc = a_00 + jj * jcstep_a; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bszid_t* bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ @@ -476,14 +476,14 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current PC block dimension. */ \ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_jc + pp * pcstep_a; \ - ctype* restrict b_pc = b_00 + pp * pcstep_b; \ + ctype* a_pc = a_jc + pp * pcstep_a; \ + ctype* b_pc = b_00 + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? 
&beta_local : &one_local ); \ + ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ \ ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ \ /* Set the bszid_t array and thrinfo_t pointer based on whether we will be packing A. If we won't be packing A, we alias to @@ -493,7 +493,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ - bszid_t* restrict bszids_pa; \ + bszid_t* bszids_pa; \ if ( packa ) { bszids_pa = &bszids_pc[1]; \ thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ else { bszids_pa = &bszids_pc[0]; \ @@ -526,7 +526,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ - ctype* restrict a_pc_use = a_use; \ + ctype* a_pc_use = a_use; \ \ /* We don't need to embed the panel stride of A within the auxinfo_t object because this variant iterates through A in the jr loop, @@ -535,8 +535,8 @@ void PASTEMAC(ch,varname) \ /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_ic = &bszids_pa[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pa ); \ + bszid_t* bszids_ic = &bszids_pa[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pa ); \ bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ @@ -555,11 +555,11 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current IC block dimension. */ \ const dim_t mc_cur = ( MC <= ic_end - ii ? 
MC : ic_left ); \ \ - ctype* restrict b_ic = b_pc + ii * icstep_b; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* b_ic = b_pc + ii * icstep_b; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ \ /* Set the bszid_t array and thrinfo_t pointer based on whether we will be packing A. If we won't be packing A, we alias to @@ -569,7 +569,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ - bszid_t* restrict bszids_pb; \ + bszid_t* bszids_pb; \ if ( packb ) { bszids_pb = &bszids_ic[1]; \ thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ else { bszids_pb = &bszids_ic[0]; \ @@ -602,7 +602,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ - ctype* restrict b_ic_use = b_use; \ + ctype* b_ic_use = b_use; \ \ /* Embed the panel stride of B within the auxinfo_t object. The millikernel will query and use this to iterate through @@ -610,8 +610,8 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_jr = &bszids_pb[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pb ); \ + bszid_t* bszids_jr = &bszids_pb[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pb ); \ bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ @@ -640,10 +640,10 @@ void PASTEMAC(ch,varname) \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? 
MR : jr_left ); \ \ /* - ctype* restrict a_jr = a_pc + j * jrstep_a; \ + ctype* a_jr = a_pc + j * jrstep_a; \ */ \ - ctype* restrict a_jr = a_pc_use + j * ps_a_use; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* a_jr = a_pc_use + j * ps_a_use; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ @@ -664,7 +664,7 @@ void PASTEMAC(ch,varname) \ a_jr, rs_a_use, cs_a_use, \ b_ic_use, rs_b_use, cs_b_use, \ beta_use, \ - c_jr, rs_c, cs_c, \ + c_jr, rs_c, cs_c, \ &aux, \ cntx \ ); \ @@ -712,16 +712,16 @@ static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m); void bli_gemmsup_ref_var2m ( - trans_t trans, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - stor3_t eff_id, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + trans_t trans, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + stor3_t eff_id, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ) { #if 0 @@ -746,41 +746,41 @@ void bli_gemmsup_ref_var2m const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #else - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); - const bool packa = 
bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + const bool packa = bli_rntm_pack_a( rntm ); + const bool packb = bli_rntm_pack_b( rntm ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - dim_t k; + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a; - inc_t cs_a; + const void* buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a; + inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b; - inc_t cs_b; + const void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b; + inc_t cs_b; if ( bli_obj_has_notrans( a ) ) { @@ -810,12 +810,12 @@ void bli_gemmsup_ref_var2m cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #endif @@ -841,13 +841,13 @@ void bli_gemmsup_ref_var2m m, n, k, - buf_alpha, - buf_a, rs_a, cs_a, - buf_b, rs_b, cs_b, - buf_beta, - buf_c, rs_c, cs_c, + ( void* )buf_alpha, + ( void* )buf_a, rs_a, cs_a, + ( void* )buf_b, rs_b, cs_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, eff_id, - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -864,13 +864,13 @@ void bli_gemmsup_ref_var2m n, // swap the m and n dimensions. m, k, - buf_alpha, - buf_b, cs_b, rs_b, // swap the positions of A and B. 
- buf_a, cs_a, rs_a, // swap the strides of A and B. - buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. + ( void* )buf_alpha, + ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. + ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. + ( void* )buf_beta, + buf_c, cs_c, rs_c, // swap the strides of C. bli_stor3_trans( eff_id ), // transpose the stor3_t id. - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -883,22 +883,22 @@ void bli_gemmsup_ref_var2m \ void PASTEMAC(ch,varname) \ ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -998,13 +998,13 @@ void PASTEMAC(ch,varname) \ /* Query the context for the sup microkernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ /* Make local copies of beta and one scalars to prevent any unnecessary sharing of cache lines between the cores' caches. */ \ @@ -1035,7 +1035,7 @@ void PASTEMAC(ch,varname) \ bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* restrict bszids; \ + bszid_t* bszids; \ \ /* Set the bszids pointer to the correct bszids array above based on which matrices (if any) are being packed. */ \ @@ -1047,16 +1047,16 @@ void PASTEMAC(ch,varname) \ /* Determine whether we are using more than one thread. */ \ const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ \ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* thread_jc = NULL; \ + thrinfo_t* thread_pc = NULL; \ + thrinfo_t* thread_pb = NULL; \ + thrinfo_t* thread_ic = NULL; \ + thrinfo_t* thread_pa = NULL; \ + thrinfo_t* thread_jr = NULL; \ \ /* Grow the thrinfo_t tree. 
*/ \ - bszid_t* restrict bszids_jc = bszids; \ - thread_jc = thread; \ + bszid_t* bszids_jc = bszids; \ + thread_jc = thread; \ bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ \ /* Compute the JC loop thread range for the current thread. */ \ @@ -1075,12 +1075,12 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current JC block dimension. */ \ const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ \ - ctype* restrict b_jc = b_00 + jj * jcstep_b; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* b_jc = b_00 + jj * jcstep_b; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bszid_t* bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ @@ -1098,11 +1098,11 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current PC block dimension. */ \ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_00 + pp * pcstep_a; \ - ctype* restrict b_pc = b_jc + pp * pcstep_b; \ + ctype* a_pc = a_00 + pp * pcstep_a; \ + ctype* b_pc = b_jc + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ + ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ @@ -1115,7 +1115,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). 
*/ \ - bszid_t* restrict bszids_pb; \ + bszid_t* bszids_pb; \ if ( packb ) { bszids_pb = &bszids_pc[1]; \ thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ else { bszids_pb = &bszids_pc[0]; \ @@ -1146,7 +1146,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ - ctype* restrict b_pc_use = b_use; \ + ctype* b_pc_use = b_use; \ \ /* We don't need to embed the panel stride of B within the auxinfo_t object because this variant iterates through B in the jr loop, @@ -1155,8 +1155,8 @@ void PASTEMAC(ch,varname) \ /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_ic = &bszids_pb[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bszid_t* bszids_ic = &bszids_pb[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ @@ -1175,8 +1175,8 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current IC block dimension. */ \ const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ \ - ctype* restrict a_ic = a_pc + ii * icstep_a; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* a_ic = a_pc + ii * icstep_a; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ @@ -1189,7 +1189,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ - bszid_t* restrict bszids_pa; \ + bszid_t* bszids_pa; \ if ( packa ) { bszids_pa = &bszids_ic[1]; \ thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ else { bszids_pa = &bszids_ic[0]; \ @@ -1220,7 +1220,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias a_use so that it's clear this is our current block of matrix A. 
*/ \ - ctype* restrict a_ic_use = a_use; \ + ctype* a_ic_use = a_use; \ \ /* Embed the panel stride of A within the auxinfo_t object. The millikernel will query and use this to iterate through @@ -1228,8 +1228,8 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_jr = &bszids_pa[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bszid_t* bszids_jr = &bszids_pa[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ @@ -1258,10 +1258,10 @@ void PASTEMAC(ch,varname) \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ /* - ctype* restrict b_jr = b_pc_use + j * jrstep_b; \ + ctype* b_jr = b_pc_use + j * jrstep_b; \ */ \ - ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* b_jr = b_pc_use + j * ps_b_use; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ @@ -1282,7 +1282,7 @@ void PASTEMAC(ch,varname) \ a_ic_use, rs_a_use, cs_a_use, \ b_jr, rs_b_use, cs_b_use, \ beta_use, \ - c_jr, rs_c, cs_c, \ + c_jr, rs_c, cs_c, \ &aux, \ cntx \ ); \ diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index ead9925e6..df9a747ab 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -42,16 +42,16 @@ \ void PASTEMAC0(opname) \ ( \ - trans_t trans, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - stor3_t eff_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ + trans_t trans, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + stor3_t eff_id, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) @@ -70,20 +70,20 @@ GENPROT( gemmsup_ref_var2m ) \ void 
PASTEMAC(ch,varname) \ ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t eff_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) @@ -94,22 +94,22 @@ INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) \ void PASTEMAC(ch,varname) \ ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t eff_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) @@ -119,12 +119,12 @@ INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( - num_t dt, - trans_t* trans, - bool packa, - bool packb, - stor3_t* eff_id, - cntx_t* cntx + num_t dt, + trans_t* trans, + bool packa, + bool packb, + stor3_t* eff_id, + const cntx_t* cntx ) { const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( 
*eff_id ), cntx ); diff --git a/frame/3/bli_l3_tapi.c b/frame/3/bli_l3_tapi.c index afec5b677..8f256a11a 100644 --- a/frame/3/bli_l3_tapi.c +++ b/frame/3/bli_l3_tapi.c @@ -43,16 +43,16 @@ \ void PASTEMAC(ch,opname) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -80,16 +80,16 @@ INSERT_GENTFUNC_BASIC0( gemm ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -118,17 +118,17 @@ INSERT_GENTFUNC_BASIC0( gemmt ) \ void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + 
ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -159,14 +159,14 @@ INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -193,16 +193,16 @@ INSERT_GENTFUNCR_BASIC0( herk ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -231,14 +231,14 @@ INSERT_GENTFUNCR_BASIC0( her2k ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -265,16 +265,16 @@ INSERT_GENTFUNC_BASIC0( syrk ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - 
dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -303,18 +303,18 @@ INSERT_GENTFUNC_BASIC0( syr2k ) \ void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -345,15 +345,15 @@ INSERT_GENTFUNC_BASIC0( trmm3 ) \ void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t diff --git a/frame/3/bli_l3_tapi.h b/frame/3/bli_l3_tapi.h index 4b3504001..9b7a9b077 100644 --- a/frame/3/bli_l3_tapi.h +++ b/frame/3/bli_l3_tapi.h @@ -43,16 +43,16 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) 
\ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemm ) @@ -62,17 +62,17 @@ INSERT_GENTPROT_BASIC0( gemm ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( hemm ) @@ -84,14 +84,14 @@ INSERT_GENTPROT_BASIC0( symm ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( herk ) @@ -102,16 +102,16 @@ INSERT_GENTPROTR_BASIC0( herk ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t 
rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( her2k ) @@ -122,14 +122,14 @@ INSERT_GENTPROTR_BASIC0( her2k ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( syrk ) @@ -140,16 +140,16 @@ INSERT_GENTPROT_BASIC0( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemmt ) @@ -161,18 +161,18 @@ INSERT_GENTPROT_BASIC0( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, 
\ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( trmm3 ) @@ -183,15 +183,15 @@ INSERT_GENTPROT_BASIC0( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT_BASIC0( trmm ) diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c index f6a52fb5e..c934ba949 100644 --- a/frame/3/bli_l3_tapi_ex.c +++ b/frame/3/bli_l3_tapi_ex.c @@ -44,18 +44,18 @@ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -74,12 +74,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, 
&bo ); \ - bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ @@ -103,19 +103,19 @@ INSERT_GENTFUNC_BASIC0( gemm ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -134,12 +134,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dim_with_side( side, m, n, &mn_a ); \ bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_conj( conja, &ao ); \ @@ -169,16 +169,16 @@ INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC ) \ void 
PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -195,11 +195,11 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ \ - bli_obj_init_finish_1x1( dt_r, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt_r, beta, &betao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -225,18 +225,18 @@ INSERT_GENTFUNCR_BASIC0( herk ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -256,12 +256,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dims_with_trans( transa, 
m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt_r, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -289,16 +289,16 @@ INSERT_GENTFUNCR_BASIC0( her2k ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -314,11 +314,11 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ 
@@ -344,18 +344,18 @@ INSERT_GENTFUNC_BASIC0( syrk ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -374,12 +374,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -407,18 +407,18 @@ INSERT_GENTFUNC_BASIC0( syr2k ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t 
k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -437,12 +437,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, k, m, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -468,20 +468,20 @@ INSERT_GENTFUNC_BASIC0( gemmt ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -500,12 +500,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dim_with_side( side, m, n, &mn_a ); \ 
bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ @@ -535,17 +535,17 @@ INSERT_GENTFUNC_BASIC0( trmm3 ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -560,10 +560,10 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ \ bli_set_dim_with_side( side, m, n, &mn_a ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ \ - bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m, n, b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m, n, b, rs_b, cs_b, &bo ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h index 1ab0a8ff1..eb142af05 100644 --- 
a/frame/3/bli_l3_tapi_ex.h +++ b/frame/3/bli_l3_tapi_ex.h @@ -43,18 +43,18 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) @@ -64,19 +64,19 @@ INSERT_GENTPROT_BASIC0( gemm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) @@ -88,16 +88,16 @@ INSERT_GENTPROT_BASIC0( symm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, inc_t 
rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) @@ -108,18 +108,18 @@ INSERT_GENTPROTR_BASIC0( herk ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) @@ -130,16 +130,16 @@ INSERT_GENTPROTR_BASIC0( her2k ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) @@ -150,18 +150,18 @@ INSERT_GENTPROT_BASIC0( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, 
inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) @@ -173,20 +173,20 @@ INSERT_GENTPROT_BASIC0( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) @@ -197,17 +197,17 @@ INSERT_GENTPROT_BASIC0( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h index 677afc020..44a59bd4c 100644 --- a/frame/3/bli_l3_ukr_prot.h +++ b/frame/3/bli_l3_ukr_prot.h @@ -50,8 +50,8 @@ void PASTEMAC(ch,opname) \ ctype_in* restrict b, \ ctype_out* restrict beta, \ ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); @@ -68,8 +68,8 @@ 
void PASTEMAC(ch,opname) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); @@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c index 56eaf3f4c..c2e8ed5d5 100644 --- a/frame/3/bli_l3_ukr_tapi.c +++ b/frame/3/bli_l3_ukr_tapi.c @@ -47,8 +47,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ bli_init_once(); \ @@ -92,8 +92,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ bli_init_once(); \ @@ -133,8 +133,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ bli_init_once(); \ diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index de077e5ad..485779a90 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -37,44 +37,47 @@ void bli_gemm_blk_var1 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t a1, c1; - dim_t my_start, my_end; - dim_t b_alg; + obj_t ap, cp; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( c, &cp ); // 
Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + const dir_t direct = bli_l3_direct( &ap, b, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntl ); + bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl ); // Determine the current thread's subpartition range. + dim_t my_start, my_end; bli_thread_range_mdim ( - direct, thread, a, b, c, cntl, cntx, + direct, thread, &ap, b, &cp, cntl, cntx, &my_start, &my_end ); // Partition along the m dimension. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, a, + b_alg = bli_determine_blocksize( direct, i, my_end, &ap, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. + obj_t a1, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); + i, b_alg, &ap, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); + i, b_alg, &cp, &c1 ); // Perform gemm subproblem. bli_l3_int diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 53943e47c..254a31064 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -37,44 +37,47 @@ void bli_gemm_blk_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t b1, c1; - dim_t my_start, my_end; - dim_t b_alg; + obj_t bp, cp; + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cp ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( a, &bp, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. 
- bli_l3_prune_unref_mparts_n( a, b, c, cntl ); + bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl ); // Determine the current thread's subpartition range. + dim_t my_start, my_end; bli_thread_range_ndim ( - direct, thread, a, b, c, cntl, cntx, + direct, thread, a, &bp, &cp, cntl, cntx, &my_start, &my_end ); // Partition along the n dimension. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, b, + b_alg = bli_determine_blocksize( direct, i, my_end, &bp, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. + obj_t b1, c1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); + i, b_alg, &cp, &c1 ); // Perform gemm subproblem. bli_l3_int diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 28029777d..1bbec1d95 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -36,39 +36,43 @@ void bli_gemm_blk_var3 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t a1, b1; - dim_t b_alg; + obj_t ap, bp, cs; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cs ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntl ); + bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl ); // Query dimension in partitioning direction. 
- dim_t k_trans = bli_obj_width_after_trans( a ); + dim_t k_trans = bli_obj_width_after_trans( &ap ); // Partition along the k dimension. + dim_t b_alg; for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, + b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp, bli_cntl_bszid( cntl ), cntx, cntl ); // Acquire partitions for A1 and B1. + obj_t a1, b1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); + i, b_alg, &ap, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); // Perform gemm subproblem. bli_l3_int @@ -77,7 +81,7 @@ void bli_gemm_blk_var3 &a1, &b1, &BLIS_ONE, - c, + &cs, cntx, rntm, bli_cntl_sub_node( cntl ), @@ -107,7 +111,7 @@ void bli_gemm_blk_var3 // Thus, for neither trmm nor trmm3 should we reset the scalar on C // after the first iteration. if ( bli_cntl_family( cntl ) != BLIS_TRMM ) - if ( i == 0 ) bli_obj_scalar_reset( c ); + if ( i == 0 ) bli_obj_scalar_reset( &cs ); } } diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index cd8827bd9..1ae904abf 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -37,14 +37,14 @@ void bli_gemm_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); @@ -163,8 +163,8 @@ void bli_gemm_front rntm ); - obj_t* cp = &c_local; - obj_t* betap = beta; + obj_t* cp = &c_local; + const obj_t* betap = beta; #ifdef BLIS_ENABLE_GEMM_MD #ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM @@ -269,7 +269,7 @@ void bli_gemm_front // If we created a temporary matrix conformal to C for whatever reason, // we copy/accumulate the result back to C and then release the object. 
if ( use_ct ) - { + { obj_t beta_local; bli_obj_scalar_detach( &c_local, &beta_local ); diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 2728ce8f7..744f88d1b 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -34,26 +34,26 @@ void bli_gemm_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ); #endif diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 874a12439..814b47c0c 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -77,38 +77,38 @@ static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn); void bli_gemm_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - num_t dt_c = bli_obj_dt( c ); + num_t dt_exec = bli_obj_exec_dt( c ); + num_t dt_c = bli_obj_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); - char* a_cast = bli_obj_buffer_at_off( a ); - inc_t is_a = 
bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const char* a_cast = bli_obj_buffer_at_off( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); - char* b_cast = bli_obj_buffer_at_off( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const char* b_cast = bli_obj_buffer_at_off( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); - char* c_cast = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + char* c_cast = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); // If any dimension is zero, return immediately. if ( bli_zero_dim3( m, n, k ) ) return; @@ -129,8 +129,8 @@ void bli_gemm_ker_var2 // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, // and we know that the internal scalar in C is already of the type dt_c // due to the casting in the implementation of bli_obj_scalar_attach(). - char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); - char* beta_cast = bli_obj_internal_scalar_buffer( c ); + const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); + const char* beta_cast = bli_obj_internal_scalar_buffer( c ); // If 1m is being employed on a column- or row-stored matrix with a // real-valued beta, we can use the real domain macro-kernel, which @@ -174,14 +174,12 @@ void bli_gemm_ker_var2 } #endif - siz_t dt_size = bli_dt_size( dt_exec ); - siz_t dt_c_size = bli_dt_size( dt_c ); + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); // Alias some constants to simpler names. 
- const dim_t MR = pd_a; - const dim_t NR = pd_b; - //const dim_t PACKMR = cs_a; - //const dim_t PACKNR = rs_b; + const dim_t MR = pd_a; + const dim_t NR = pd_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. @@ -191,7 +189,7 @@ void bli_gemm_ker_var2 // field of the params struct. If that function pointer is non-NULL, use it // as our microkernel instead of the default microkernel queried from the // cntx above. - gemm_ker_params_t* params = bli_obj_ker_params( c ); + const gemm_ker_params_t* params = bli_obj_ker_params( c ); gemm_ukr_vft user_ukr = params ? params->ukr : NULL; if ( user_ukr ) gemm_ukr = user_ukr; @@ -204,7 +202,7 @@ void bli_gemm_ker_var2 const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); - char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + const char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); // // Assumptions/assertions: @@ -277,24 +275,24 @@ void bli_gemm_ker_var2 // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) { - char* b1 = b_cast + j * cstep_b; - char* c1 = c_cast + j * cstep_c; + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. - char* b2 = b1; + const char* b2 = b1; // Loop over the m dimension (MR rows at a time). for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) { - char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); // Compute the addresses of the next panels of A and B. - char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); + const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) { a2 = a_cast; @@ -320,13 +318,13 @@ void bli_gemm_ker_var2 m_cur, n_cur, k, - alpha_cast, - a1, - b1, - beta_cast, - c11, rs_c, cs_c, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, &aux, - cntx + ( cntx_t* )cntx ); } else @@ -337,13 +335,13 @@ void bli_gemm_ker_var2 MR, NR, k, - alpha_cast, - a1, - b1, - zero, - &ct, rs_ct, cs_ct, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + &ct, rs_ct, cs_ct, &aux, - cntx + ( cntx_t* )cntx ); // Accumulate to C with type-casting. @@ -351,7 +349,7 @@ void bli_gemm_ker_var2 ( m_cur, n_cur, &ct, rs_ct, cs_ct, - beta_cast, + ( void* )beta_cast, c11, rs_c, cs_c ); } diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index 6202cfffd..a283c1235 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -39,12 +39,12 @@ void bli_gemm_md ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -148,12 +148,12 @@ void bli_gemm_md // cab mddm_t bli_gemm_md_ccr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -201,48 +201,51 @@ mddm_t bli_gemm_md_ccr // Copy the real domain blocksizes into the slots of their complex // counterparts. 
- blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); - blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); - blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); - blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); - blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local ); + blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local ); + blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local ); + blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local ); + blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mr, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nr, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mc, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nc, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_kc, BLIS_SCOMPLEX, &blksz_kc ); + 
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc ); // Halve both the real and complex MR's (which are both real MR's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_mr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_mr ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mr ); // Halve both the real and complex MC's (which are both real MC's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_mc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_mc ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mc ); - // Use the default pack schemas in the objects. + bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local ); - // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); + // Use the default pack schemas in the objects. // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. 
- bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs ); - bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs ); + bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); + bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); // Return the computation and execution domains. return doms; @@ -253,12 +256,12 @@ mddm_t bli_gemm_md_ccr // cab mddm_t bli_gemm_md_crc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -306,48 +309,51 @@ mddm_t bli_gemm_md_crc // Copy the real domain blocksizes into the slots of their complex // counterparts. - blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); - blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); - blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); - blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); - blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local ); + blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local ); + blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local ); + blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local ); + blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mr, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nr, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr ); 
- bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mc, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nc, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_kc, BLIS_SCOMPLEX, &blksz_kc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc ); // Halve both the real and complex NR's (which are both real NR's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_nr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_nr ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nr ); // Halve both the real and complex NC's (which are both real NC's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_nc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_nc ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nc ); - // Use the default pack schemas in the objects. 
+ bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local ); - // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); + // Use the default pack schemas in the objects. // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. - bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs ); - bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs ); + bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); + bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); // Return the computation and execution domains. return doms; @@ -358,12 +364,12 @@ mddm_t bli_gemm_md_crc // cab mddm_t bli_gemm_md_rcc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -384,32 +390,38 @@ mddm_t bli_gemm_md_rcc // Copy the real domain blocksizes into the slots of their complex // counterparts. 
- blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); - blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); - blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); - blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); - blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local ); + blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local ); + blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local ); + blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local ); + blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mr, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nr, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mc, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nc, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_kc, BLIS_SCOMPLEX, &blksz_kc ); + 
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc ); // Halve both the real and complex KC's (which are both real KC's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_kc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_kc ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_kc ); + + bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local ); // Use the 1r pack schema for both A and B with the conjugation // of A or B toggled (to produce ar * br - ai * bi). @@ -427,14 +439,14 @@ mddm_t bli_gemm_md_rcc // the target datatype. (The packm_blk_var1_md() function has "built-in" // support for packing to 1r (and 1e) schemas, whereas the // packm_blk_var1() function relies on packm kernels for packing to 1r. 
- const num_t dt_complex = bli_obj_dt( a ); - cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); + const num_t dt_complex = bli_obj_dt( a ); + const cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); - func_t* cntx_funcs = bli_cntx_ukrs_buf( *cntx ); - func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m ); + const func_t* packm_1m_mr = bli_cntx_get_ukrs( BLIS_PACKM_MRXK_KER, cntx_1m ); + const func_t* packm_1m_nr = bli_cntx_get_ukrs( BLIS_PACKM_NRXK_KER, cntx_1m ); - cntx_funcs[ BLIS_PACKM_MRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_MRXK_KER ]; - cntx_funcs[ BLIS_PACKM_NRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_NRXK_KER ]; + bli_cntx_set_ukr( BLIS_PACKM_MRXK_KER, packm_1m_mr, cntx_local ); + bli_cntx_set_ukr( BLIS_PACKM_NRXK_KER, packm_1m_nr, cntx_local ); // Return the computation and execution domains. return doms; @@ -445,12 +457,12 @@ mddm_t bli_gemm_md_rcc // cab mddm_t bli_gemm_md_crr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -502,12 +514,12 @@ mddm_t bli_gemm_md_crr // cab mddm_t bli_gemm_md_rcr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -540,12 +552,12 @@ mddm_t bli_gemm_md_rcr // cab mddm_t bli_gemm_md_rrc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -578,12 +590,12 @@ mddm_t bli_gemm_md_rrc // cab mddm_t bli_gemm_md_rrr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -608,12 
+620,12 @@ mddm_t bli_gemm_md_rrr // cab mddm_t bli_gemm_md_ccc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h index 751e271ea..d71d97987 100644 --- a/frame/3/gemm/bli_gemm_md.h +++ b/frame/3/gemm/bli_gemm_md.h @@ -43,51 +43,51 @@ typedef struct mddm_s void bli_gemm_md ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ); -mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* 
b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); void bli_gemm_md_zgemm ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) +BLIS_INLINE bool bli_gemm_md_is_crr( const obj_t* a, const obj_t* b, const obj_t* c ) { bool r_val = FALSE; @@ -107,7 +107,7 @@ BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) return r_val; } -BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) +BLIS_INLINE bool bli_gemm_md_is_ccr( const obj_t* a, const obj_t* b, const obj_t* c ) { bool r_val = FALSE; @@ -127,7 +127,7 @@ BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) return r_val; } -BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) +BLIS_INLINE bool bli_gemm_md_is_crc( const obj_t* a, const obj_t* b, const obj_t* c ) { bool r_val = FALSE; @@ -151,17 +151,17 @@ BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) BLIS_INLINE 
void bli_gemm_md_ker_var2_recast ( - num_t* dt_comp, - num_t dt_a, - num_t dt_b, - num_t* dt_c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - obj_t* c, - inc_t* rs_c, inc_t* cs_c + num_t* dt_comp, + num_t dt_a, + num_t dt_b, + num_t* dt_c, + dim_t* m, + dim_t* n, + dim_t* k, + inc_t* pd_a, inc_t* ps_a, + inc_t* pd_b, inc_t* ps_b, + const obj_t* c, + inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c index a4797ad4f..086a3b1df 100644 --- a/frame/3/gemm/bli_gemm_md_c2r_ref.c +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c @@ -49,8 +49,8 @@ void PASTEMAC2(ch,opname,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 888181bad..d3109e600 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -53,13 +53,13 @@ typedef struct \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h index 52ea81a5e..789d5895c 100644 --- a/frame/3/gemm/ind/bli_gemm_ind_opt.h +++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h @@ -34,16 +34,16 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params ( - num_t* dt_exec, - num_t* dt_c, - pack_t schema_a, - obj_t* c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - inc_t* rs_c, inc_t* cs_c + num_t* dt_exec, + num_t* dt_c, + pack_t schema_a, + const obj_t* c, + 
dim_t* m, + dim_t* n, + dim_t* k, + inc_t* pd_a, inc_t* ps_a, + inc_t* pd_b, inc_t* ps_b, + inc_t* rs_c, inc_t* cs_c ) { obj_t beta; diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index d53838470..e291b5f27 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -37,14 +37,14 @@ void bli_gemmt_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/gemmt/bli_gemmt_front.h index c5967f8b8..0f2a9ada2 100644 --- a/frame/3/gemmt/bli_gemmt_front.h +++ b/frame/3/gemmt/bli_gemmt_front.h @@ -35,12 +35,12 @@ void bli_gemmt_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index 3aedc6e9a..aed0359ec 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -62,81 +62,74 @@ static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); void bli_gemmt_l_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffc = bli_obj_diag_offset( c ); - dim_t m = bli_obj_length( 
c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. 
- buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index b3a9fe8a1..87d77ee55 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -62,81 +62,74 @@ static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); void bli_gemmt_u_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffc = bli_obj_diag_offset( c ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - 
dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h index 60c68c9f5..98d8f5563 100644 --- a/frame/3/gemmt/bli_gemmt_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -43,13 +43,13 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* ah, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 3a1d681c3..76fe106b0 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -42,13 +42,13 @@ static l3_var_oft vars[2] = void bli_gemmt_x_ker_var2 ( - obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* ah, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { dim_t uplo; diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 15460125d..c39703503 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -36,15 +36,15 @@ void bli_hemm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const 
cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 308b6378b..63eb91cd3 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -34,13 +34,13 @@ void bli_hemm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 8108b607f..c9aada989 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -36,15 +36,15 @@ void bli_symm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 909997f6c..417cb9acb 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -34,13 +34,13 @@ void bli_symm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index d973b6eb6..edd4ce1ef 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -37,13 +37,13 @@ void bli_trmm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* 
alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index 3e136f9dc..cfefdd39b 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -34,11 +34,11 @@ void bli_trmm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 646287f93..f5476b2ca 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); void bli_trmm_ll_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffa = bli_obj_diag_offset( a ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - 
inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffa, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 9ef2a475d..df5b2dac5 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); void bli_trmm_lu_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffa = bli_obj_diag_offset( a ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + 
const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffa, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index f6b20af2e..89f86aa3a 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); void bli_trmm_rl_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffb = bli_obj_diag_offset( b ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + 
const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index f71fb3c4d..4ed38e761 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); void bli_trmm_ru_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffb = bli_obj_diag_offset( b ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + 
const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index 262b0490f..2f0642ca8 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -43,13 +43,13 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index 898cfe242..d42bc88c2 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -43,13 +43,13 @@ static l3_var_oft vars[2][2] = void bli_trmm_xx_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { dim_t side; diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 9cd04963b..9681eb640 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -36,15 +36,15 @@ void bli_trmm3_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + 
cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index 296b9354b..b5dde34cd 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -34,13 +34,13 @@ void bli_trmm3_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 30bf6921c..79ac65c48 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -39,34 +39,35 @@ void bli_trsm_blk_var1 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - dim_t my_start, my_end; - dim_t b_alg; + obj_t ap, cp; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( c, &cp ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( &ap, b, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntl ); + bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl ); // Isolate the diagonal block A11 and its corresponding row panel C1. - const dim_t kc = bli_obj_width_after_trans( a ); + const dim_t kc = bli_obj_width_after_trans( &ap ); obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - 0, kc, a, &a11 ); + 0, kc, &ap, &a11 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - 0, kc, c, &c1 ); + 0, kc, &cp, &c1 ); // All threads iterate over the entire diagonal block A11. 
- my_start = 0; my_end = kc; + dim_t my_start = 0, my_end = kc; #ifdef PRINT printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n", @@ -76,14 +77,14 @@ void bli_trsm_blk_var1 #endif // Partition along the m dimension for the trsm subproblem. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { - obj_t a11_1, c1_1; - b_alg = bli_determine_blocksize( direct, i, my_end, &a11, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. + obj_t a11_1, c1_1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &a11, &a11_1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, @@ -124,9 +125,9 @@ void bli_trsm_blk_var1 // on whether we are moving forwards or backwards, respectively). obj_t ax1, cx1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A, - 0, kc, a, &ax1 ); + 0, kc, &ap, &ax1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A, - 0, kc, c, &cx1 ); + 0, kc, &cp, &cx1 ); #ifdef PRINT printf( "bli_trsm_blk_var1(): ax1 is %d x %d at offsets (%3d, %3d)\n", @@ -139,7 +140,7 @@ void bli_trsm_blk_var1 bli_thread_range_mdim ( direct, thread, &ax1, b, &cx1, cntl, cntx, - &my_start, &my_end + &my_start, &my_end ); #ifdef PRINT @@ -149,13 +150,12 @@ void bli_trsm_blk_var1 // Partition along the m dimension for the gemm subproblem. for ( dim_t i = my_start; i < my_end; i += b_alg ) { - obj_t a11, c1; - // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, &ax1, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. 
+ obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &ax1, &a11 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 5691c964a..88db57e51 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -37,44 +37,47 @@ void bli_trsm_blk_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t b1, c1; - dim_t my_start, my_end; - dim_t b_alg; + obj_t bp, cp; + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cp ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( a, &bp, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_n( a, b, c, cntl ); + bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl ); // Determine the current thread's subpartition range. + dim_t my_start, my_end; bli_thread_range_ndim ( - direct, thread, a, b, c, cntl, cntx, + direct, thread, a, &bp, &cp, cntl, cntx, &my_start, &my_end ); // Partition along the n dimension. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, b, + b_alg = bli_determine_blocksize( direct, i, my_end, &bp, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. + obj_t b1, c1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); + i, b_alg, &cp, &c1 ); // Perform trsm subproblem. 
bli_l3_int diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 43fc25f16..2ff3db6f1 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -36,39 +36,43 @@ void bli_trsm_blk_var3 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t a1, b1; - dim_t b_alg; + obj_t ap, bp, cs; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cs ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntl ); + bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl ); // Query dimension in partitioning direction. - dim_t k_trans = bli_obj_width_after_trans( a ); + dim_t k_trans = bli_obj_width_after_trans( &ap ); // Partition along the k dimension. + dim_t b_alg; for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b, + b_alg = bli_trsm_determine_kc( direct, i, k_trans, &ap, &bp, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and B1. + obj_t a1, b1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); + i, b_alg, &ap, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); // Perform trsm subproblem. bli_l3_int @@ -77,7 +81,7 @@ void bli_trsm_blk_var3 &a1, &b1, &BLIS_ONE, - c, + &cs, cntx, rntm, bli_cntl_sub_node( cntl ), @@ -92,8 +96,9 @@ void bli_trsm_blk_var3 // that they are only used in the first iteration. 
if ( i == 0 ) { - bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( c ); + bli_obj_scalar_reset( &ap ); + bli_obj_scalar_reset( &bp ); + bli_obj_scalar_reset( &cs ); } } } diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 7f3d17aef..b94a129d9 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -37,13 +37,13 @@ void bli_trsm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index 379935536..b31e88b04 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -35,13 +35,13 @@ void bli_trsm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 7b1133c2a..075b40336 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); void bli_trsm_ll_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffa = bli_obj_diag_offset( a ); + const doff_t diagoffa = bli_obj_diag_offset( a ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a 
= bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar // attached to B (the non-triangular matrix). This will be the alpha @@ -105,7 +100,7 @@ void bli_trsm_ll_ker_var2 // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_ll_ker_var2 // the diagonal. 
We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffa, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 2059d1c9f..799fdd101 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); void bli_trsm_lu_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffa = bli_obj_diag_offset( a ); + const doff_t diagoffa = bli_obj_diag_offset( a ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); 
- inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar // attached to B (the non-triangular matrix). This will be the alpha @@ -105,7 +100,7 @@ void bli_trsm_lu_ker_var2 // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_lu_ker_var2 // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. 
- f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffa, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index cace3622a..721203df7 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); void bli_trsm_rl_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = 
bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha @@ -105,7 +100,7 @@ void bli_trsm_rl_ker_var2 // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_rl_ker_var2 // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 4b0c7f083..447fbf8cd 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); void bli_trsm_ru_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = 
bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha @@ -105,7 +100,7 @@ void bli_trsm_ru_ker_var2 // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_ru_ker_var2 // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 8322a8b5b..7e747b4a8 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -43,13 +43,13 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index c30a5828a..a0a59c0a8 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -43,13 +43,13 @@ static l3_var_oft vars[2][2] = void bli_trsm_xx_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { dim_t side; diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c index e2d812351..a42c7103e 100644 --- a/frame/base/bli_apool.c +++ b/frame/base/bli_apool.c @@ -36,7 +36,7 @@ void bli_apool_init ( - apool_t* restrict apool + apool_t* apool ) { err_t r_val; @@ -47,7 +47,7 @@ void bli_apool_init // library initialization. // Query the mutex from the apool_t. - //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + //bli_pthread_mutex_t* mutex = bli_apool_mutex( apool ); // Initialize the mutex. 
//*mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; @@ -76,7 +76,7 @@ void bli_apool_init const siz_t align_size = 64; // Query the underlying pool_t from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // Set the default array_t length of the apool_t. bli_apool_set_def_array_len( num_elem, apool ); @@ -92,7 +92,7 @@ void bli_apool_init #endif // Allocate the block_ptrs array. - array_t** restrict block_ptrs + array_t** block_ptrs = bli_malloc_intl( block_ptrs_len * sizeof( array_t* ), &r_val ); @@ -139,8 +139,8 @@ void bli_apool_init void bli_apool_alloc_block ( - siz_t num_elem, - array_t** restrict array_p + siz_t num_elem, + array_t** array_p ) { err_t r_val; @@ -156,9 +156,7 @@ void bli_apool_alloc_block // Allocate the array_t via the bli_fmalloc_align() wrapper, which performs // alignment logic and opaquely saves the original pointer so that it can // be recovered when it's time to free the block. - array_t* restrict array - = - bli_malloc_intl( block_size, &r_val ); + array_t* array = bli_malloc_intl( block_size, &r_val ); // Initialize an array_t struct within the newly allocated memory region. bli_array_init( num_elem, sizeof( pool_t* ), array ); @@ -169,16 +167,16 @@ void bli_apool_alloc_block void bli_apool_free_block ( - array_t* restrict array + array_t* array ) { - const siz_t num_elem = bli_array_num_elem( array ); - pool_t** restrict buf = bli_array_buf( array ); + const siz_t num_elem = bli_array_num_elem( array ); + pool_t** buf = bli_array_buf( array ); // Step through the array and finalize each pool_t. 
for ( dim_t i = 0; i < num_elem; ++i ) { - pool_t* restrict pool = buf[ i ]; + pool_t* pool = buf[ i ]; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): freeing pool_t %d within array_t.\n", @@ -218,25 +216,25 @@ void bli_apool_free_block void bli_apool_finalize ( - apool_t* restrict apool + apool_t* apool ) { // NOTE: Since the apool_t's mutex is now initialized statically, we no // longer need to explicitly destroy it. // Query the mutex from the apool_t. - //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + //bli_pthread_mutex_t* mutex = bli_apool_mutex( apool ); // Destroy the mutex. //bli_pthread_mutex_destroy( mutex ); // Query the underlying pool_t and mutex from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // ---------------------------------------------------------------- // Query the block_ptrs array. - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); // Query the total number of blocks currently allocated. siz_t num_blocks = bli_pool_num_blocks( pool ); @@ -270,8 +268,8 @@ void bli_apool_finalize array_t* bli_apool_checkout_array ( - siz_t n_threads, - apool_t* restrict apool + siz_t n_threads, + apool_t* apool ) { // Acquire the apool_t's mutex. @@ -298,10 +296,10 @@ array_t* bli_apool_checkout_array // At this point, at least one array_t is guaranteed to be available. // Query the underlying pool_t from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // Query the block_ptrs array. - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -313,7 +311,7 @@ array_t* bli_apool_checkout_array #endif // Select the array_t* at top_index to return to the caller. 
- array_t* restrict array = block_ptrs[ top_index ]; + array_t* array = block_ptrs[ top_index ]; // Increment the pool's top_index. bli_pool_set_top_index( top_index + 1, pool ); @@ -333,15 +331,15 @@ array_t* bli_apool_checkout_array void bli_apool_checkin_array ( - array_t* restrict array, - apool_t* restrict apool + array_t* array, + apool_t* apool ) { // Acquire the apool_t's mutex. bli_apool_lock( apool ); // Query the underlying pool_t from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // ---------------------------------------------------------------------------- @@ -351,7 +349,7 @@ void bli_apool_checkin_array // change. // Query the block_ptrs array. - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -376,8 +374,8 @@ void bli_apool_checkin_array pool_t* bli_apool_array_elem ( - siz_t index, - array_t* restrict array + siz_t index, + array_t* array ) { err_t r_val; @@ -391,8 +389,8 @@ pool_t* bli_apool_array_elem // stores in the array_t are pool_t*, that means that the function is // actually returning the address of a pool_t*, or pool_t**, hence the // dereferencing below. - pool_t** restrict pool_p = bli_array_elem( index, array ); - pool_t* pool = *pool_p; + pool_t** pool_p = bli_array_elem( index, array ); + pool_t* pool = *pool_p; // If the element is NULL, then it means a pool_t has not yet been created // and allocated for the given index (thread id). @@ -463,8 +461,8 @@ pool_t* bli_apool_array_elem void bli_apool_grow ( - siz_t num_blocks_add, - apool_t* restrict apool + siz_t num_blocks_add, + apool_t* apool ) { err_t r_val; @@ -473,7 +471,7 @@ void bli_apool_grow if ( num_blocks_add == 0 ) return; // Query the underlying pool_t from the apool_t. 
- pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // Query the default initial array length from the apool_t. const siz_t num_elem = bli_apool_def_array_len( apool ); @@ -499,7 +497,7 @@ void bli_apool_grow const siz_t block_ptrs_len_new = 2 * block_ptrs_len_cur; // Query the current block_ptrs array. - array_t** restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); + array_t** block_ptrs_cur = bli_pool_block_ptrs( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_grow(): growing block_ptrs_len (%d -> %d): ", @@ -507,7 +505,7 @@ void bli_apool_grow #endif // Allocate a new block_ptrs array. - array_t** restrict block_ptrs_new + array_t** block_ptrs_new = bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ), &r_val ); @@ -541,7 +539,7 @@ void bli_apool_grow // blocks. // Query the current block_ptrs array (which was maybe just resized). - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_grow(): growing apool_t (%d -> %d).\n", diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h index e6e91958a..d06f79207 100644 --- a/frame/base/bli_apool.h +++ b/frame/base/bli_apool.h @@ -61,16 +61,14 @@ BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) return &(apool->mutex); } -BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) +BLIS_INLINE siz_t bli_apool_def_array_len( const apool_t* pool ) { return pool->def_array_len; } -BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) +BLIS_INLINE bool bli_apool_is_exhausted( const apool_t* apool ) { - pool_t* restrict pool = bli_apool_pool( apool ); - - return bli_pool_is_exhausted( pool ); + return bli_pool_is_exhausted( &apool->pool ); } // apool action @@ -96,44 +94,44 @@ BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool void bli_apool_init ( - apool_t* restrict apool + apool_t* apool ); void 
bli_apool_finalize ( - apool_t* restrict apool + apool_t* apool ); array_t* bli_apool_checkout_array ( - siz_t n_threads, - apool_t* restrict apool + siz_t n_threads, + apool_t* apool ); void bli_apool_checkin_array ( - array_t* restrict array, - apool_t* restrict apool + array_t* array, + apool_t* apool ); pool_t* bli_apool_array_elem ( - siz_t index, - array_t* restrict array + siz_t index, + array_t* array ); void bli_apool_grow ( - siz_t num_blocks_add, - apool_t* restrict apool + siz_t num_blocks_add, + apool_t* apool ); void bli_apool_alloc_block ( - siz_t num_elem, - array_t** restrict array_p + siz_t num_elem, + array_t** array_p ); void bli_apool_free_block ( - array_t* restrict array + array_t* array ); diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 54aa64d42..48b50a774 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -121,7 +121,7 @@ void bli_arch_set_id( void ) // initialized. Query the address of an internal context data structure // corresponding to req_id. This pointer will be NULL if the associated // subconfig is not available. - cntx_t** req_cntx = bli_gks_lookup_id( req_id ); + const cntx_t* const * req_cntx = bli_gks_lookup_id( req_id ); // This function checks the context pointer and aborts with a useful // error message if the pointer is found to be NULL. @@ -253,7 +253,7 @@ void bli_arch_set_id( void ) // enumeration that is typedef'ed in bli_type_defs.h. That is, the // index order of each string should correspond to the implied/assigned // enum value given to the corresponding BLIS_ARCH_ value. 
-static char* config_name[ BLIS_NUM_ARCHS ] = +static const char* config_name[ BLIS_NUM_ARCHS ] = { "skx", "knl", @@ -283,11 +283,11 @@ static char* config_name[ BLIS_NUM_ARCHS ] = "power9", "power7", "bgq", - + "generic" }; -char* bli_arch_string( arch_t id ) +const char* bli_arch_string( arch_t id ) { return config_name[ id ]; } @@ -306,9 +306,9 @@ bool bli_arch_get_logging( void ) return arch_dolog; } -void bli_arch_log( char* fmt, ... ) +void bli_arch_log( const char* fmt, ... ) { - char prefix[] = "libblis: "; + const char prefix[] = "libblis: "; int n_chars = strlen( prefix ) + strlen( fmt ) + 1; if ( bli_arch_get_logging() && fmt ) diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h index 0cd55dace..08af7ae79 100644 --- a/frame/base/bli_arch.h +++ b/frame/base/bli_arch.h @@ -40,11 +40,11 @@ BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); -BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); +BLIS_EXPORT_BLIS const char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); -void bli_arch_log( char*, ... ); +void bli_arch_log( const char*, ... ); #endif diff --git a/frame/base/bli_array.c b/frame/base/bli_array.c index 3844cd52f..ea47a0024 100644 --- a/frame/base/bli_array.c +++ b/frame/base/bli_array.c @@ -38,9 +38,9 @@ void bli_array_init ( - const siz_t num_elem, - const siz_t elem_size, - array_t* restrict array + siz_t num_elem, + siz_t elem_size, + array_t* array ) { err_t r_val; @@ -54,7 +54,7 @@ void bli_array_init const size_t array_size = num_elem * elem_size; // Allocate the array buffer. - void* restrict buf = bli_malloc_intl( array_size, &r_val ); + void* buf = bli_malloc_intl( array_size, &r_val ); // Initialize the array elements to zero. 
THIS IS IMPORANT because // consumer threads will use the NULL-ness of the array elements to @@ -70,8 +70,8 @@ void bli_array_init void bli_array_resize ( - const siz_t num_elem_new, - array_t* restrict array + siz_t num_elem_new, + array_t* array ) { err_t r_val; @@ -94,7 +94,7 @@ void bli_array_resize const size_t array_size_new = num_elem_new * elem_size; // Query the previous array buffer. - void* restrict buf_prev = bli_array_buf( array ); + void* buf_prev = bli_array_buf( array ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_resize(): allocating array [%d * %d]: ", @@ -102,7 +102,7 @@ void bli_array_resize #endif // Allocate a new array buffer. - char* restrict buf_new = bli_malloc_intl( array_size_new, &r_val ); + char* buf_new = bli_malloc_intl( array_size_new, &r_val ); // Copy the previous array contents to the new array. memcpy( buf_new, buf_prev, array_size_prev ); @@ -129,7 +129,7 @@ void bli_array_resize void bli_array_finalize ( - array_t* restrict array + array_t* array ) { #ifdef BLIS_ENABLE_MEM_TRACING @@ -138,7 +138,7 @@ void bli_array_finalize #endif // Query the buffer from the array. - void* restrict buf = bli_array_buf( array ); + void* buf = bli_array_buf( array ); // Free the buffer. bli_free_intl( buf ); @@ -146,8 +146,8 @@ void bli_array_finalize void* bli_array_elem ( - const siz_t index, - array_t* restrict array + siz_t index, + const array_t* array ) { // Query the number of elements in the array. @@ -161,7 +161,7 @@ void* bli_array_elem // Query the buffer from the array, but store it as a char* so we can use // it to easily perform byte pointer arithmetic. - char* restrict buf = bli_array_buf( array ); + char* buf = bli_array_buf( array ); // Advance the pointer by (index * elem_size) bytes. 
buf += index * elem_size; @@ -172,17 +172,19 @@ void* bli_array_elem void bli_array_set_elem ( - void* restrict elem, - const siz_t index, - array_t* restrict array + void* elem, + siz_t index, + array_t* array ) { // Query the size of each element in the array. const siz_t elem_size = bli_array_elem_size( array ); // Query the buffer from the array as a char*. - char* restrict buf = bli_array_buf( array ); + char* buf = bli_array_buf( array ); +// memcpy() is the only safe way to copy data of unknown type +#if 0 if ( elem_size == sizeof( void* ) ) { #ifdef BLIS_ENABLE_MEM_TRACING @@ -193,16 +195,19 @@ void bli_array_set_elem // Special case: Handle elem_size = sizeof( void* ) without calling // memcpy(). - void** restrict buf_vvp = ( void** )buf; - void** restrict elem_vvp = ( void** )elem; + void** buf_vvp = ( void** )buf; + void** elem_vvp = ( void** )elem; buf_vvp[ index ] = *elem_vvp; } else { +#endif // General case: Copy the elem_size bytes from elem to buf at the // element index specified by index. 
memcpy( &buf[ index * elem_size ], elem, ( size_t )elem_size ); +#if 0 } +#endif } diff --git a/frame/base/bli_array.h b/frame/base/bli_array.h index 4cb00496b..c1e6ce038 100644 --- a/frame/base/bli_array.h +++ b/frame/base/bli_array.h @@ -51,17 +51,17 @@ typedef struct // Array entry query -BLIS_INLINE void* bli_array_buf( array_t* array ) +BLIS_INLINE void* bli_array_buf( const array_t* array ) { return array->buf; } -BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) +BLIS_INLINE siz_t bli_array_num_elem( const array_t* array ) { return array->num_elem; } -BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) +BLIS_INLINE siz_t bli_array_elem_size( const array_t* array ) { return array->elem_size; } @@ -87,30 +87,30 @@ BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ void bli_array_init ( - const siz_t num_elem, - const siz_t elem_size, - array_t* restrict array + siz_t num_elem, + siz_t elem_size, + array_t* array ); void bli_array_resize ( - const siz_t num_elem_new, - array_t* restrict array + siz_t num_elem_new, + array_t* array ); void bli_array_finalize ( - array_t* restrict array + array_t* array ); void* bli_array_elem ( - const siz_t index, - array_t* restrict array + siz_t index, + const array_t* array ); void bli_array_set_elem ( - void* restrict elem, - const siz_t index, - array_t* restrict array + void* elem, + siz_t index, + array_t* array ); #endif diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h index d8c6cbb13..166480b30 100644 --- a/frame/base/bli_auxinfo.h +++ b/frame/base/bli_auxinfo.h @@ -38,49 +38,49 @@ // auxinfo_t field query -BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) +BLIS_INLINE pack_t bli_auxinfo_schema_a( const auxinfo_t* ai ) { return ai->schema_a; } -BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) +BLIS_INLINE pack_t bli_auxinfo_schema_b( const auxinfo_t* ai ) { return ai->schema_b; } -BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) +BLIS_INLINE const 
void* bli_auxinfo_next_a( const auxinfo_t* ai ) { return ai->a_next; } -BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) +BLIS_INLINE const void* bli_auxinfo_next_b( const auxinfo_t* ai ) { return ai->b_next; } -BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_is_a( const auxinfo_t* ai ) { return ai->is_a; } -BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_is_b( const auxinfo_t* ai ) { return ai->is_b; } -BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_ps_a( const auxinfo_t* ai ) { return ai->ps_a; } -BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_ps_b( const auxinfo_t* ai ) { return ai->ps_b; } -BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) +BLIS_INLINE void_fp bli_auxinfo_ukr( const auxinfo_t* ai ) { - return ai->ukr; + return ai->ukr; } -BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) +BLIS_INLINE const void* bli_auxinfo_params( const auxinfo_t* ai ) { - return ai->params; + return ai->params; } @@ -95,15 +95,15 @@ BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) ai->schema_b = schema; } -BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) +BLIS_INLINE void bli_auxinfo_set_next_a( const void* p, auxinfo_t* ai ) { ai->a_next = p; } -BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) +BLIS_INLINE void bli_auxinfo_set_next_b( const void* p, auxinfo_t* ai ) { ai->b_next = p; } -BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) +BLIS_INLINE void bli_auxinfo_set_next_ab( const void* ap, const void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; @@ -129,11 +129,11 @@ BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { - ai->ukr = ukr; + ai->ukr = ukr; } -BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) +BLIS_INLINE void 
bli_auxinfo_set_params( const void* params, auxinfo_t* ai ) { - ai->params = params; + ai->params = params; } #endif diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 524653d74..38b4b7956 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -235,12 +235,12 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ) { if ( direct == BLIS_FWD ) @@ -251,17 +251,17 @@ dim_t bli_determine_blocksize dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ) { - num_t dt; - blksz_t* bsize; - dim_t b_alg, b_max; - dim_t b_use; + num_t dt; + const blksz_t* bsize; + dim_t b_alg, b_max; + dim_t b_use; // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. @@ -277,17 +277,17 @@ dim_t bli_determine_blocksize_f dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ) { - num_t dt; - blksz_t* bsize; - dim_t b_alg, b_max; - dim_t b_use; + num_t dt; + const blksz_t* bsize; + dim_t b_alg, b_max; + dim_t b_use; // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. 
diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 2e0fefeae..d91c0542d 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -36,8 +36,8 @@ BLIS_INLINE dim_t bli_blksz_get_def ( - num_t dt, - blksz_t* b + num_t dt, + const blksz_t* b ) { return b->v[ dt ]; @@ -45,8 +45,8 @@ BLIS_INLINE dim_t bli_blksz_get_def BLIS_INLINE dim_t bli_blksz_get_max ( - num_t dt, - blksz_t* b + num_t dt, + const blksz_t* b ) { return b->e[ dt ]; @@ -77,8 +77,8 @@ BLIS_INLINE void bli_blksz_set_max BLIS_INLINE void bli_blksz_copy ( - blksz_t* b_src, - blksz_t* b_dst + const blksz_t* b_src, + blksz_t* b_dst ) { *b_dst = *b_src; @@ -86,8 +86,8 @@ BLIS_INLINE void bli_blksz_copy BLIS_INLINE void bli_blksz_copy_if_pos ( - blksz_t* b_src, - blksz_t* b_dst + const blksz_t* b_src, + blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that @@ -116,8 +116,8 @@ BLIS_INLINE void bli_blksz_copy_if_pos BLIS_INLINE void bli_blksz_copy_def_dt ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst + num_t dt_src, const blksz_t* b_src, + num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); @@ -127,8 +127,8 @@ BLIS_INLINE void bli_blksz_copy_def_dt BLIS_INLINE void bli_blksz_copy_max_dt ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst + num_t dt_src, const blksz_t* b_src, + num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); @@ -138,8 +138,8 @@ BLIS_INLINE void bli_blksz_copy_max_dt BLIS_INLINE void bli_blksz_copy_dt ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst + num_t dt_src, const blksz_t* b_src, + num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); @@ -252,30 +252,30 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const 
cntx_t* cntx ); dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ); dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c index e76314036..16c418b49 100644 --- a/frame/base/bli_check.c +++ b/frame/base/bli_check.c @@ -37,7 +37,7 @@ // -- General stuff ------------------------------------------------------------ -err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ) +err_t bli_check_error_code_helper( gint_t code, const char* file, guint_t line ) { if ( code == BLIS_SUCCESS ) return code; @@ -68,7 +68,7 @@ err_t bli_check_valid_error_level( errlev_t level ) return e_val; } -err_t bli_check_null_pointer( void* ptr ) +err_t bli_check_null_pointer( const void* ptr ) { err_t e_val = BLIS_SUCCESS; @@ -128,7 +128,7 @@ err_t bli_check_valid_diag( diag_t diag ) return e_val; } -err_t bli_check_nonunit_diag( obj_t* a ) +err_t bli_check_nonunit_diag( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -155,7 +155,7 @@ err_t bli_check_valid_datatype( num_t dt ) return e_val; } -err_t bli_check_object_valid_datatype( obj_t* a ) +err_t bli_check_object_valid_datatype( const obj_t* a ) { err_t e_val; num_t dt; @@ -176,7 +176,7 @@ err_t bli_check_noninteger_datatype( num_t dt ) return e_val; } -err_t bli_check_noninteger_object( obj_t* a ) +err_t bli_check_noninteger_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -197,7 +197,7 @@ err_t bli_check_nonconstant_datatype( num_t dt ) return e_val; } -err_t bli_check_nonconstant_object( obj_t* a ) +err_t bli_check_nonconstant_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -221,7 +221,7 @@ err_t bli_check_floating_datatype( num_t dt ) return 
e_val; } -err_t bli_check_floating_object( obj_t* a ) +err_t bli_check_floating_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -243,7 +243,7 @@ err_t bli_check_real_datatype( num_t dt ) return e_val; } -err_t bli_check_real_object( obj_t* a ) +err_t bli_check_real_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -264,7 +264,7 @@ err_t bli_check_integer_datatype( num_t dt ) return e_val; } -err_t bli_check_integer_object( obj_t* a ) +err_t bli_check_integer_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -287,7 +287,7 @@ err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ) return e_val; } -err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ) +err_t bli_check_consistent_object_datatypes( const obj_t* a, const obj_t* b ) { err_t e_val; num_t dt_a; @@ -315,7 +315,7 @@ err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ) return e_val; } -err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ) +err_t bli_check_object_real_proj_of( const obj_t* c, const obj_t* r ) { err_t e_val; num_t dt_c; @@ -329,7 +329,7 @@ err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ) return e_val; } -err_t bli_check_real_valued_object( obj_t* a ) +err_t bli_check_real_valued_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; double a_real; @@ -363,7 +363,7 @@ err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ) return e_val; } -err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ) +err_t bli_check_consistent_object_precisions( const obj_t* a, const obj_t* b ) { err_t e_val; num_t dt_a; @@ -379,7 +379,7 @@ err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ) // -- Dimension-related checks ------------------------------------------------- -err_t bli_check_conformal_dims( obj_t* a, obj_t* b ) +err_t bli_check_conformal_dims( const obj_t* a, const obj_t* b ) { err_t e_val = BLIS_SUCCESS; dim_t m_a, n_a; @@ -396,7 +396,7 @@ err_t bli_check_conformal_dims( obj_t* a, obj_t* b ) return e_val; } -err_t 
bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ) +err_t bli_check_level3_dims( const obj_t* a, const obj_t* b, const obj_t* c ) { err_t e_val = BLIS_SUCCESS; dim_t m_c, n_c; @@ -420,7 +420,7 @@ err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ) return e_val; } -err_t bli_check_scalar_object( obj_t* a ) +err_t bli_check_scalar_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -435,7 +435,7 @@ err_t bli_check_scalar_object( obj_t* a ) return e_val; } -err_t bli_check_vector_object( obj_t* a ) +err_t bli_check_vector_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -449,7 +449,7 @@ err_t bli_check_vector_object( obj_t* a ) return e_val; } -err_t bli_check_matrix_object( obj_t* a ) +err_t bli_check_matrix_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -460,7 +460,7 @@ err_t bli_check_matrix_object( obj_t* a ) return e_val; } -err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ) +err_t bli_check_equal_vector_lengths( const obj_t* x, const obj_t* y ) { err_t e_val = BLIS_SUCCESS; dim_t dim_x; @@ -475,7 +475,7 @@ err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ) return e_val; } -err_t bli_check_square_object( obj_t* a ) +err_t bli_check_square_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -485,7 +485,7 @@ err_t bli_check_square_object( obj_t* a ) return e_val; } -err_t bli_check_object_length_equals( obj_t* a, dim_t m ) +err_t bli_check_object_length_equals( const obj_t* a, dim_t m ) { err_t e_val = BLIS_SUCCESS; @@ -495,7 +495,7 @@ err_t bli_check_object_length_equals( obj_t* a, dim_t m ) return e_val; } -err_t bli_check_object_width_equals( obj_t* a, dim_t n ) +err_t bli_check_object_width_equals( const obj_t* a, dim_t n ) { err_t e_val = BLIS_SUCCESS; @@ -505,7 +505,7 @@ err_t bli_check_object_width_equals( obj_t* a, dim_t n ) return e_val; } -err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ) +err_t bli_check_vector_dim_equals( const obj_t* a, dim_t n ) { err_t e_val = BLIS_SUCCESS; @@ -515,7 
+515,7 @@ err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ) return e_val; } -err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ) +err_t bli_check_object_diag_offset_equals( const obj_t* a, doff_t offset ) { err_t e_val = BLIS_SUCCESS; @@ -612,7 +612,7 @@ err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ) // -- Structure-related checks ------------------------------------------------- -err_t bli_check_general_object( obj_t* a ) +err_t bli_check_general_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -622,7 +622,7 @@ err_t bli_check_general_object( obj_t* a ) return e_val; } -err_t bli_check_hermitian_object( obj_t* a ) +err_t bli_check_hermitian_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -632,7 +632,7 @@ err_t bli_check_hermitian_object( obj_t* a ) return e_val; } -err_t bli_check_symmetric_object( obj_t* a ) +err_t bli_check_symmetric_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -642,7 +642,7 @@ err_t bli_check_symmetric_object( obj_t* a ) return e_val; } -err_t bli_check_triangular_object( obj_t* a ) +err_t bli_check_triangular_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -652,7 +652,7 @@ err_t bli_check_triangular_object( obj_t* a ) return e_val; } -err_t bli_check_object_struc( obj_t* a, struc_t struc ) +err_t bli_check_object_struc( const obj_t* a, struc_t struc ) { err_t e_val = BLIS_SUCCESS; @@ -666,7 +666,7 @@ err_t bli_check_object_struc( obj_t* a, struc_t struc ) // -- Storage-related checks --------------------------------------------------- -err_t bli_check_upper_or_lower_object( obj_t* a ) +err_t bli_check_upper_or_lower_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -731,7 +731,7 @@ err_t bli_check_valid_3x3_subpart( subpart_t part ) // -- Control tree-related checks ---------------------------------------------- -err_t bli_check_valid_cntl( void* cntl ) +err_t bli_check_valid_cntl( const void* cntl ) { err_t e_val = BLIS_SUCCESS; @@ -743,7 
+743,7 @@ err_t bli_check_valid_cntl( void* cntl ) // -- Packing-related checks --------------------------------------------------- -err_t bli_check_packm_schema_on_unpack( obj_t* a ) +err_t bli_check_packm_schema_on_unpack( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -756,7 +756,7 @@ err_t bli_check_packm_schema_on_unpack( obj_t* a ) return e_val; } -err_t bli_check_packv_schema_on_unpack( obj_t* a ) +err_t bli_check_packv_schema_on_unpack( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -768,7 +768,7 @@ err_t bli_check_packv_schema_on_unpack( obj_t* a ) // -- Buffer-related checks ---------------------------------------------------- -err_t bli_check_object_buffer( obj_t* a ) +err_t bli_check_object_buffer( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -783,7 +783,7 @@ err_t bli_check_object_buffer( obj_t* a ) // -- Memory checks ------------------------------------------------------------ -err_t bli_check_valid_malloc_buf( void* ptr ) +err_t bli_check_valid_malloc_buf( const void* ptr ) { err_t e_val = BLIS_SUCCESS; @@ -809,7 +809,7 @@ err_t bli_check_valid_packbuf( packbuf_t buf_type ) return e_val; } -err_t bli_check_if_exhausted_pool( pool_t* pool ) +err_t bli_check_if_exhausted_pool( const pool_t* pool ) { err_t e_val = BLIS_SUCCESS; @@ -819,7 +819,7 @@ err_t bli_check_if_exhausted_pool( pool_t* pool ) return e_val; } -err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ) +err_t bli_check_sufficient_stack_buf_size( const cntx_t* cntx ) { err_t e_val = BLIS_SUCCESS; num_t dt; @@ -873,7 +873,7 @@ err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ) // -- Object-related errors ---------------------------------------------------- -err_t bli_check_object_alias_of( obj_t* a, obj_t* b ) +err_t bli_check_object_alias_of( const obj_t* a, const obj_t* b ) { err_t e_val = BLIS_SUCCESS; @@ -895,7 +895,7 @@ err_t bli_check_valid_arch_id( arch_t id ) return e_val; } -err_t bli_check_initialized_gks_cntx( cntx_t** cntx ) +err_t 
bli_check_initialized_gks_cntx( const cntx_t* const * cntx ) { err_t e_val = BLIS_SUCCESS; @@ -907,7 +907,7 @@ err_t bli_check_initialized_gks_cntx( cntx_t** cntx ) // -- Architecture-related errors ---------------------------------------------- -err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ) +err_t bli_check_valid_mc_mod_mult( const blksz_t* mc, const blksz_t* mr ) { num_t dt; @@ -924,7 +924,7 @@ err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ) return BLIS_SUCCESS; } -err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ) +err_t bli_check_valid_nc_mod_mult( const blksz_t* nc, const blksz_t* nr ) { num_t dt; @@ -941,7 +941,7 @@ err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ) return BLIS_SUCCESS; } -err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ) +err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr ) { num_t dt; diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h index 276d27689..f1e2201a7 100644 --- a/frame/base/bli_check.h +++ b/frame/base/bli_check.h @@ -34,85 +34,85 @@ */ -BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); +BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, const char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); -err_t bli_check_null_pointer( void* ptr ); +err_t bli_check_null_pointer( const void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); -err_t bli_check_nonunit_diag( obj_t* a ); +err_t bli_check_nonunit_diag( const obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); -err_t bli_check_object_valid_datatype( obj_t* a ); +err_t bli_check_object_valid_datatype( const obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); -err_t bli_check_noninteger_object( obj_t* a ); +err_t bli_check_noninteger_object( const obj_t* a ); 
err_t bli_check_nonconstant_datatype( num_t dt ); -err_t bli_check_nonconstant_object( obj_t* a ); +err_t bli_check_nonconstant_object( const obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); -err_t bli_check_floating_object( obj_t* a ); +err_t bli_check_floating_object( const obj_t* a ); err_t bli_check_real_datatype( num_t dt ); -err_t bli_check_real_object( obj_t* a ); +err_t bli_check_real_object( const obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); -err_t bli_check_integer_object( obj_t* a ); +err_t bli_check_integer_object( const obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); -err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); +err_t bli_check_consistent_object_datatypes( const obj_t* a, const obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); -err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); -err_t bli_check_real_valued_object( obj_t* a ); +err_t bli_check_object_real_proj_of( const obj_t* c, const obj_t* r ); +err_t bli_check_real_valued_object( const obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); -err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); - -err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); -err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); -err_t bli_check_scalar_object( obj_t* a ); -err_t bli_check_vector_object( obj_t* a ); -err_t bli_check_matrix_object( obj_t* a ); -err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); -err_t bli_check_square_object( obj_t* a ); -err_t bli_check_object_length_equals( obj_t* a, dim_t m ); -err_t bli_check_object_width_equals( obj_t* a, dim_t n ); -err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); -err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); +err_t bli_check_consistent_object_precisions( const obj_t* a, const obj_t* b ); + +err_t bli_check_conformal_dims( const obj_t* a, const obj_t* b ); +err_t 
bli_check_level3_dims( const obj_t* a, const obj_t* b, const obj_t* c ); +err_t bli_check_scalar_object( const obj_t* a ); +err_t bli_check_vector_object( const obj_t* a ); +err_t bli_check_matrix_object( const obj_t* a ); +err_t bli_check_equal_vector_lengths( const obj_t* x, const obj_t* y ); +err_t bli_check_square_object( const obj_t* a ); +err_t bli_check_object_length_equals( const obj_t* a, dim_t m ); +err_t bli_check_object_width_equals( const obj_t* a, dim_t n ); +err_t bli_check_vector_dim_equals( const obj_t* a, dim_t n ); +err_t bli_check_object_diag_offset_equals( const obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); -err_t bli_check_general_object( obj_t* a ); -err_t bli_check_hermitian_object( obj_t* a ); -err_t bli_check_symmetric_object( obj_t* a ); -err_t bli_check_triangular_object( obj_t* a ); -err_t bli_check_object_struc( obj_t* a, struc_t struc ); +err_t bli_check_general_object( const obj_t* a ); +err_t bli_check_hermitian_object( const obj_t* a ); +err_t bli_check_symmetric_object( const obj_t* a ); +err_t bli_check_triangular_object( const obj_t* a ); +err_t bli_check_object_struc( const obj_t* a, struc_t struc ); -err_t bli_check_upper_or_lower_object( obj_t* a ); +err_t bli_check_upper_or_lower_object( const obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); -err_t bli_check_valid_cntl( void* cntl ); +err_t bli_check_valid_cntl( const void* cntl ); -err_t bli_check_packm_schema_on_unpack( obj_t* a ); -err_t bli_check_packv_schema_on_unpack( obj_t* a ); +err_t bli_check_packm_schema_on_unpack( const obj_t* a ); +err_t bli_check_packv_schema_on_unpack( const obj_t* a ); -err_t bli_check_object_buffer( obj_t* a ); +err_t bli_check_object_buffer( const obj_t* a ); -err_t bli_check_valid_malloc_buf( void* ptr ); +err_t bli_check_valid_malloc_buf( const void* ptr ); 
err_t bli_check_valid_packbuf( packbuf_t buf_type ); -err_t bli_check_if_exhausted_pool( pool_t* pool ); -err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); +err_t bli_check_if_exhausted_pool( const pool_t* pool ); +err_t bli_check_sufficient_stack_buf_size( const cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); -err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); +err_t bli_check_object_alias_of( const obj_t* a, const obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); -err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); +err_t bli_check_initialized_gks_cntx( const cntx_t* const * cntx ); -err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); -err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); -err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); +err_t bli_check_valid_mc_mod_mult( const blksz_t* mc, const blksz_t* mr ); +err_t bli_check_valid_nc_mod_mult( const blksz_t* nc, const blksz_t* nr ); +err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index f8846198f..b22ddbee0 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -349,8 +349,8 @@ void bli_cntl_mark_family dim_t bli_cntl_calc_num_threads_in ( - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { dim_t n_threads_in = 1; diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 67dd02f0c..406a350ee 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -119,45 +119,45 @@ BLIS_EXPORT_BLIS void bli_cntl_mark_family dim_t bli_cntl_calc_num_threads_in ( - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) -BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) +BLIS_INLINE opid_t bli_cntl_family( 
const cntl_t* cntl ) { return cntl->family; } -BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) +BLIS_INLINE bszid_t bli_cntl_bszid( const cntl_t* cntl ) { return cntl->bszid; } -BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) +BLIS_INLINE void_fp bli_cntl_var_func( const cntl_t* cntl ) { return cntl->var_func; } -BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) +BLIS_INLINE cntl_t* bli_cntl_sub_prenode( const cntl_t* cntl ) { return cntl->sub_prenode; } -BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) +BLIS_INLINE cntl_t* bli_cntl_sub_node( const cntl_t* cntl ) { return cntl->sub_node; } -BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) +BLIS_INLINE void* bli_cntl_params( const cntl_t* cntl ) { return cntl->params; } -BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) +BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); @@ -170,19 +170,19 @@ BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) // cntl_t query (complex) -BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } -BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_is_leaf( const cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } -BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_does_part( const cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 218325d5a..70057060f 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -70,8 +70,8 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... 
) // Query the context for the addresses of: // - the blocksize object array // - the blocksize multiple array - blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx ); - bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx ); + blksz_t* cntx_blkszs = cntx->blkszs; + bszid_t* cntx_bmults = cntx->bmults; // Initialize variable argument environment. va_list args; @@ -165,7 +165,7 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ) // Query the context for the blksz_t object assoicated with the // current blocksize id, and also query the object corresponding // to the blocksize multiple. - blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); + blksz_t* cntx_blksz = ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ); // Copy the real domain value of the blksz_t object into the // corresponding complex domain slot of the same object. @@ -218,7 +218,7 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) */ // Query the context for the address of the ukernel func_t array - func_t* cntx_ukrs = bli_cntx_ukrs_buf( cntx ); + func_t* cntx_ukrs = cntx->ukrs; // Initialize variable argument environment. va_list args; @@ -262,7 +262,7 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break; case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break; case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break; - default: ukrs = NULL; break; + default: ukrs = NULL; break; }; if ( ukrs ) @@ -297,7 +297,7 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) */ // Query the context for the address of the ukernel preference mbool_t array - mbool_t* cntx_ukr_prefs = bli_cntx_ukr_prefs_buf( cntx ); + mbool_t* cntx_ukr_prefs = cntx->ukr_prefs; // Initialize variable argument environment. va_list args; @@ -355,7 +355,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ) */ // Query the context for the address of the l3 sup handlers array. 
- void_fp* cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); + void_fp* cntx_l3_sup_handlers = cntx->l3_sup_handlers; // Initialize variable argument environment. va_list args; @@ -386,7 +386,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ) // ----------------------------------------------------------------------------- -void bli_cntx_print( cntx_t* cntx ) +void bli_cntx_print( const cntx_t* cntx ) { dim_t i; @@ -410,7 +410,7 @@ void bli_cntx_print( cntx_t* cntx ) for ( i = 0; i < BLIS_NUM_UKRS; ++i ) { - func_t* ukr = bli_cntx_get_ukrs( i, cntx ); + const func_t* ukr = bli_cntx_get_ukrs( i, cntx ); printf( "ukr %2lu: %16p %16p %16p %16p\n", ( unsigned long )i, @@ -423,7 +423,7 @@ void bli_cntx_print( cntx_t* cntx ) for ( i = 0; i < BLIS_NUM_UKR_PREFS; ++i ) { - mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx ); + const mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx ); printf( "ukr pref %2lu: %d %d %d %d\n", ( unsigned long )i, diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 412430e9b..827b19cfd 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -62,27 +62,7 @@ typedef struct cntx_s // -- cntx_t query (fields only) ----------------------------------------------- // -BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) -{ - return cntx->blkszs; -} -BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) -{ - return cntx->bmults; -} -BLIS_INLINE func_t* bli_cntx_ukrs_buf( cntx_t* cntx ) -{ - return cntx->ukrs; -} -BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx ) -{ - return cntx->ukr_prefs; -} -BLIS_INLINE void_fp* bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_handlers; -} -BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) +BLIS_INLINE ind_t bli_cntx_method( const cntx_t* cntx ) { return cntx->method; } @@ -104,75 +84,66 @@ BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) // -- cntx_t query (complex) 
--------------------------------------------------- // -BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE const blksz_t* bli_cntx_get_blksz( bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - // Return the address of the blksz_t identified by bs_id. - return blksz; + return &cntx->blkszs[ bs_id ]; } -BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_def( dt, blksz ); + const blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } -BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_max( dt, blksz ); + const blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } -BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, const cntx_t* cntx ) { - bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); - bszid_t bm_id = bmults[ bs_id ]; - - return bm_id; + return cntx->bmults[ bs_id ]; } -BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE const blksz_t* bli_cntx_get_bmult( bszid_t bs_id, const cntx_t* cntx ) { - bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); - blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); + bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); + const blksz_t* bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } -BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); - dim_t bm_dt = bli_blksz_get_def( dt, bmult ); + const blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); + dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- -BLIS_INLINE func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx ) { - func_t* funcs = bli_cntx_ukrs_buf( cntx ); - func_t* func = &funcs[ ukr_id ]; - - return func; + return &cntx->ukrs[ ukr_id ]; } -BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { - func_t* func = bli_cntx_get_ukrs( ukr_id, cntx ); + const func_t* func = bli_cntx_get_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } -BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { switch ( 
ukr_id ) { @@ -189,24 +160,21 @@ BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* // ----------------------------------------------------------------------------- -BLIS_INLINE mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t ukr_id, cntx_t* cntx ) +BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t pref_id, const cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); - mbool_t* mbool = &mbools[ ukr_id ]; - - return mbool; + return &cntx->ukr_prefs[ pref_id ]; } -BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, const cntx_t* cntx ) { - mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx ); + const mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, const cntx_t* cntx ) { if ( m < bli_cntx_get_blksz_def_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_blksz_def_dt( dt, BLIS_NT, cntx ) ) return TRUE; @@ -217,17 +185,14 @@ BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_ // ----------------------------------------------------------------------------- -BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, const cntx_t* cntx ) { - void_fp* funcs = bli_cntx_l3_sup_handlers_buf( cntx ); - void_fp func = funcs[ op ]; - - return func; + return cntx->l3_sup_handlers[ op ]; } // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool 
bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { // This initial value will get overwritten during the switch statement below. ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; @@ -275,12 +240,12 @@ BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* c return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx ); } -BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_prefers_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) { const bool ukr_prefers_rows = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); @@ -291,7 +256,7 @@ BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* return FALSE; } -BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) { return ! 
bli_cntx_prefers_storage_of( obj, ukr_id, cntx ); } @@ -307,58 +272,43 @@ BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - bszid_t* bmults = bli_cntx_bmults_buf( cntx ); - - blkszs[ bs_id ] = *blksz; - bmults[ bs_id ] = mult_id; + cntx->blkszs[ bs_id ] = *blksz; + cntx->bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - bli_blksz_set_def( bs, dt, blksz ); + bli_blksz_set_def( bs, dt, &cntx->blkszs[ bs_id ] ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - bli_blksz_set_max( bs, dt, blksz ); + bli_blksz_set_max( bs, dt, &cntx->blkszs[ bs_id ]); } -BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, const func_t* func, cntx_t* cntx ) { - func_t* funcs = bli_cntx_ukrs_buf( cntx ); - - funcs[ ukr_id ] = *func; + cntx->ukrs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx ) { - func_t* func = bli_cntx_get_ukrs( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); + bli_func_set_dt( fp, dt, &cntx->ukrs[ ker_id ] ); } BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); - - mbools[ ukr_id ] = *prefs; + cntx->ukr_prefs[ ukr_id ] = *prefs; } -BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, const cntx_t* cntx ) { ukr_t ukr_id = bli_stor3_ukr( stor_id ); 
return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { switch ( bs_id ) { @@ -374,7 +324,7 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cnt return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ); } -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { switch ( bs_id ) { @@ -403,7 +353,9 @@ BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* c BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); +BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx ); + +BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ); diff --git a/frame/base/bli_const.c b/frame/base/bli_const.c index f20bc8447..210d6ae77 100644 --- a/frame/base/bli_const.c +++ b/frame/base/bli_const.c @@ -44,11 +44,11 @@ static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 ); // Statically initialize global scalar constants, attaching the addresses // of the corresponding structs above. 
-obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer ); -obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer ); -obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer ); -obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer ); -obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer ); +const obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer ); +const obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer ); +const obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer ); +const obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer ); +const obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer ); #if 0 obj_t BLIS_TWO = {}; diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c index 92aba6970..0972f1771 100644 --- a/frame/base/bli_env.c +++ b/frame/base/bli_env.c @@ -67,8 +67,8 @@ gint_t bli_env_get_var( const char* env, gint_t fallback ) { - gint_t r_val; - char* str; + gint_t r_val; + const char* str; // Query the environment variable and store the result in str. str = getenv( env ); diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index 37add3b67..f4933d962 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -36,7 +36,7 @@ #include "blis.h" // Internal array to hold error strings. 
-static char *bli_error_string[-BLIS_ERROR_CODE_MAX] = +static const char *bli_error_string[-BLIS_ERROR_CODE_MAX] = { [-BLIS_INVALID_ERROR_CHECKING_LEVEL] = "Invalid error checking level.", [-BLIS_UNDEFINED_ERROR_CODE] = "Undefined error code.", @@ -116,7 +116,7 @@ static char *bli_error_string[-BLIS_ERROR_CODE_MAX] = // ----------------------------------------------------------------------------- -void bli_print_msg( char* str, char* file, guint_t line ) +void bli_print_msg( const char* str, const char* file, guint_t line ) { fprintf( stderr, "\n" ); fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line ); @@ -156,7 +156,7 @@ bool bli_error_checking_is_enabled( void ) return bli_error_checking_level() != BLIS_NO_ERROR_CHECKING; } -char* bli_error_string_for_code( gint_t code ) +const char* bli_error_string_for_code( gint_t code ) { return bli_error_string[-code]; } diff --git a/frame/base/bli_error.h b/frame/base/bli_error.h index e6e6f35dd..f3037e2c2 100644 --- a/frame/base/bli_error.h +++ b/frame/base/bli_error.h @@ -39,8 +39,8 @@ BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); -void bli_print_msg( char* str, char* file, guint_t line ); +void bli_print_msg( const char* str, const char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); -char* bli_error_string_for_code( gint_t code ); +const char* bli_error_string_for_code( gint_t code ); diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index 477710ff0..7cb7aac6d 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -93,13 +93,13 @@ void bli_func_free( func_t* f ) // ----------------------------------------------------------------------------- -bool bli_func_is_null_dt( num_t dt, - func_t* f ) +bool bli_func_is_null_dt( num_t dt, + const func_t* f ) { return ( bli_func_get_dt( dt, f ) == NULL ); } -bool bli_func_is_null( func_t* f ) +bool bli_func_is_null( const func_t* 
f ) { bool r_val = TRUE; num_t dt; diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h index 7bdd1ab10..cf89df389 100644 --- a/frame/base/bli_func.h +++ b/frame/base/bli_func.h @@ -38,11 +38,11 @@ BLIS_INLINE void_fp bli_func_get_dt ( - num_t dt, - func_t* func + num_t dt, + const func_t* func ) { - return func->ptr[ dt ]; + return func->ptr[ dt ]; } // func_t modification @@ -54,13 +54,13 @@ BLIS_INLINE void bli_func_set_dt func_t* func ) { - func->ptr[ dt ] = fp; + func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( - num_t dt_src, func_t* func_src, - num_t dt_dst, func_t* func_dst + num_t dt_src, const func_t* func_src, + num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); @@ -96,7 +96,7 @@ void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- -bool bli_func_is_null_dt( num_t dt, - func_t* f ); -bool bli_func_is_null( func_t* f ); +bool bli_func_is_null_dt( num_t dt, + const func_t* f ); +bool bli_func_is_null( const func_t* f ); diff --git a/frame/base/bli_getopt.c b/frame/base/bli_getopt.c index 184439db5..e1d90d323 100644 --- a/frame/base/bli_getopt.c +++ b/frame/base/bli_getopt.c @@ -45,12 +45,12 @@ void bli_getopt_init_state( int opterr, getopt_t* state ) state->optopt = 0; } -int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ) +int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state ) { - static char* nextchar = NULL; + static const char* nextchar = NULL; - char* elem_str; - char* optstr_char; + const char* elem_str; + const char* optstr_char; // If argv contains no more arguments to process, return. 
if ( state->optind == argc ) return -1; diff --git a/frame/base/bli_getopt.h b/frame/base/bli_getopt.h index 1b5a7a002..bb0e4f2cf 100644 --- a/frame/base/bli_getopt.h +++ b/frame/base/bli_getopt.h @@ -34,13 +34,13 @@ typedef struct getopt_s { - char* optarg; - int optind; - int opterr; - int optopt; + const char* optarg; + int optind; + int opterr; + int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); -BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); +BLIS_EXPORT_BLIS int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 1372a055a..4a7ccbbc3 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -226,7 +226,7 @@ void bli_gks_finalize( void ) // Iterate over the architectures in the gks array. for ( id = 0; id < BLIS_NUM_ARCHS; ++id ) { - cntx_t** restrict gks_id = gks[ id ]; + cntx_t** gks_id = gks[ id ]; // Only consider context arrays for architectures that were allocated // in the first place. @@ -236,7 +236,7 @@ void bli_gks_finalize( void ) // referenced by cntx_pp. for ( ind = 0; ind < BLIS_NUM_IND_METHODS; ++ind ) { - cntx_t* restrict gks_id_ind = gks_id[ ind ]; + cntx_t* gks_id_ind = gks_id[ ind ]; // If the current context was allocated, free it. 
if ( gks_id_ind != NULL ) @@ -282,7 +282,7 @@ void bli_gks_init_index( void ) // ----------------------------------------------------------------------------- -cntx_t* bli_gks_lookup_nat_cntx +const cntx_t* bli_gks_lookup_nat_cntx ( arch_t id ) @@ -295,7 +295,7 @@ cntx_t* bli_gks_lookup_nat_cntx // ----------------------------------------------------------------------------- -cntx_t* bli_gks_lookup_ind_cntx +const cntx_t* bli_gks_lookup_ind_cntx ( arch_t id, ind_t ind @@ -316,8 +316,8 @@ cntx_t* bli_gks_lookup_ind_cntx // Index into the array of context pointers for the given architecture id, // and then index into the subarray for the given induced method. - cntx_t** restrict gks_id = gks[ id ]; - cntx_t* restrict gks_id_ind = gks_id[ ind ]; + cntx_t** gks_id = gks[ id ]; + cntx_t* gks_id_ind = gks_id[ ind ]; // Return the context pointer at gks_id_ind. return gks_id_ind; @@ -325,7 +325,7 @@ cntx_t* bli_gks_lookup_ind_cntx // ----------------------------------------------------------------------------- -cntx_t** bli_gks_lookup_id +const cntx_t* const * bli_gks_lookup_id ( arch_t id ) @@ -336,10 +336,10 @@ cntx_t** bli_gks_lookup_id // initialized. // Index into the array of context pointers for the given architecture id. - cntx_t** restrict gks_id = gks[ id ]; + cntx_t** gks_id = gks[ id ]; // Return the context pointer at gks_id_ind. - return gks_id; + return ( const cntx_t* const * )gks_id; } // ----------------------------------------------------------------------------- @@ -405,7 +405,7 @@ void bli_gks_register_cntx gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &r_val ); // Alias the allocated array for readability. - cntx_t** restrict gks_id = gks[ id ]; + cntx_t** gks_id = gks[ id ]; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_gks_register_cntx(): " ); @@ -417,7 +417,7 @@ void bli_gks_register_cntx gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ), &r_val ); // Alias the allocated context address for readability. 
- cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ]; + cntx_t* gks_id_nat = gks_id[ BLIS_NAT ]; // Call the context initialization function on the element of the newly // allocated array corresponding to native execution. @@ -440,12 +440,12 @@ void bli_gks_register_cntx // kernel is called. err_t e_val; - blksz_t* restrict mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat ); - blksz_t* restrict nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat ); - blksz_t* restrict kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat ); - blksz_t* restrict mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat ); - blksz_t* restrict nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat ); - blksz_t* restrict kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat ); + const blksz_t* mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat ); + const blksz_t* nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat ); + const blksz_t* kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat ); + const blksz_t* mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat ); + const blksz_t* nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat ); + const blksz_t* kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat ); e_val = bli_check_valid_mc_mod_mult( mc, mr ); bli_check_error_code( e_val ); e_val = bli_check_valid_nc_mod_mult( nc, nr ); bli_check_error_code( e_val ); @@ -463,12 +463,12 @@ void bli_gks_register_cntx // ----------------------------------------------------------------------------- -cntx_t* bli_gks_query_cntx( void ) +const cntx_t* bli_gks_query_cntx( void ) { return bli_gks_query_nat_cntx(); } -cntx_t* bli_gks_query_nat_cntx( void ) +const cntx_t* bli_gks_query_nat_cntx( void ) { bli_init_once(); @@ -480,14 +480,14 @@ cntx_t* bli_gks_query_nat_cntx( void ) arch_t id = bli_arch_query_id(); // Use the architecture id to look up a pointer to its context. 
- cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); + const cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); return cntx; } // ----------------------------------------------------------------------------- -cntx_t* bli_gks_query_cntx_noinit( void ) +const cntx_t* bli_gks_query_cntx_noinit( void ) { // This function is identical to bli_gks_query_cntx(), except that it // does not call bli_init_once(). @@ -496,7 +496,7 @@ cntx_t* bli_gks_query_cntx_noinit( void ) arch_t id = bli_arch_query_id(); // Use the architecture id to look up a pointer to its context. - cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); + const cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); return cntx; } @@ -507,7 +507,7 @@ cntx_t* bli_gks_query_cntx_noinit( void ) // with a new entry corresponding to a context for an ind_t value. static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; -cntx_t* bli_gks_query_ind_cntx +const cntx_t* bli_gks_query_ind_cntx ( ind_t ind, num_t dt @@ -547,8 +547,8 @@ cntx_t* bli_gks_query_ind_cntx // Query the gks for the array of context pointers corresponding to the // given architecture id. - cntx_t** restrict gks_id = gks[ id ]; - cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ]; + cntx_t** gks_id = gks[ id ]; + cntx_t* gks_id_nat = gks_id[ BLIS_NAT ]; // If for some reason the native context was requested, we can return // its address early. 
@@ -634,9 +634,9 @@ void bli_gks_init_ref_cntx bool bli_gks_cntx_l3_nat_ukr_is_ref ( - num_t dt, - ukr_t ukr_id, - cntx_t* cntx + num_t dt, + ukr_t ukr_id, + const cntx_t* cntx ) { cntx_t ref_cntx; @@ -658,7 +658,7 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref // -- level-3 micro-kernel implementation strings ------------------------------ // -static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = +static const char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = { "refrnce", "virtual", @@ -668,15 +668,15 @@ static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = // ----------------------------------------------------------------------------- -char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) +const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) { kimpl_t ki; // Query the context for the current induced method and datatype, and // then query the ukernel function pointer for the given datatype from // that context. - cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); - void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); + const cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given // datatype. If it is NULL, return the string for not applicable. @@ -742,7 +742,7 @@ kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ) } // Query the native context from the gks. 
- cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); + const cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) ) return BLIS_REFERENCE_UKERNEL; diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index b8e4c4fe0..30e3b2e39 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -35,31 +35,31 @@ #ifndef BLIS_GKS_H #define BLIS_GKS_H -void bli_gks_init( void ); -void bli_gks_finalize( void ); +void bli_gks_init( void ); +void bli_gks_finalize( void ); -void bli_gks_init_index( void ); +void bli_gks_init_index( void ); -cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); -cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); -cntx_t** bli_gks_lookup_id( arch_t id ); -void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); +const cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); +const cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); +const cntx_t* const * bli_gks_lookup_id( arch_t id ); +void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_cntx( void ); -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_cntx( void ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void ); -cntx_t* bli_gks_query_cntx_noinit( void ); +const cntx_t* bli_gks_query_cntx_noinit( void ); -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); -BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); +BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); -bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx ); +bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, const cntx_t* cntx ); -BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); -BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( 
ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ); -//char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ); +//char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ); #endif diff --git a/frame/base/bli_ind.c b/frame/base/bli_ind.c index a359e89a3..fbe740465 100644 --- a/frame/base/bli_ind.c +++ b/frame/base/bli_ind.c @@ -34,7 +34,7 @@ #include "blis.h" -static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = +static const char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = { /* 1m */ "1m", /* nat */ "native", @@ -46,7 +46,7 @@ void bli_ind_init( void ) { // NOTE: Instead of calling bli_gks_query_cntx(), we call // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). - cntx_t* cntx = bli_gks_query_cntx_noinit(); + const cntx_t* cntx = bli_gks_query_cntx_noinit(); // For each precision, enable the default induced method (1m) if both of // the following conditions are met: @@ -151,8 +151,8 @@ bool bli_ind_oper_is_impl( opid_t oper, ind_t method ) // All other operations should be reported as not implemented, // unless the requested check was for BLIS_NAT, in which case // all operations are implemented. 
- if ( method == BLIS_NAT ) is_impl = TRUE; - else is_impl = FALSE; + if ( method == BLIS_NAT ) is_impl = TRUE; + else is_impl = FALSE; } return is_impl; @@ -176,7 +176,7 @@ ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ) return method; } -char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) +const char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) { ind_t method = bli_ind_oper_find_avail( oper, dt ); @@ -185,7 +185,7 @@ char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) // ----------------------------------------------------------------------------- -char* bli_ind_get_impl_string( ind_t method ) +const char* bli_ind_get_impl_string( ind_t method ) { return bli_ind_impl_str[ method ]; } diff --git a/frame/base/bli_ind.h b/frame/base/bli_ind.h index 85cad648e..e162c5809 100644 --- a/frame/base/bli_ind.h +++ b/frame/base/bli_ind.h @@ -38,25 +38,25 @@ // level-3 induced method management #include "bli_l3_ind.h" -void bli_ind_init( void ); -void bli_ind_finalize( void ); +void bli_ind_init( void ); +void bli_ind_finalize( void ); -BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); -BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); -BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); +BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); +BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); +BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); -BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); 
-BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); -BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); -BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); +BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); +BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); -char* bli_ind_get_impl_string( ind_t method ); -num_t bli_ind_map_cdt_to_index( num_t dt ); +const char* bli_ind_get_impl_string( ind_t method ); +num_t bli_ind_map_cdt_to_index( num_t dt ); #endif diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index bfa5ca9a3..72b54ca20 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -40,12 +40,11 @@ // This string gets defined via -D on the command line when BLIS is compiled. // This string is (or rather, should be) only used here. -static char* bli_version_str = BLIS_VERSION_STRING; -static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE ); - -char* bli_info_get_version_str( void ) { return bli_version_str; } -char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; } +static const char* bli_version_str = BLIS_VERSION_STRING; +static const char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE ); +const char* bli_info_get_version_str( void ) { return bli_version_str; } +const char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; } // -- General configuration-related -------------------------------------------- @@ -158,36 +157,34 @@ gint_t bli_info_get_enable_sandbox( void ) } - // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- -char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return 
bli_gks_l3_ukr_impl_string( BLIS_GEMM_UKR, method, dt ); } -char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_L_UKR, method, dt ); } -char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_U_UKR, method, dt ); } -char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_L_UKR, method, dt ); } -char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_U_UKR, method, dt ); } - // -- BLIS implementation query (level-3) -------------------------------------- -char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } -char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } -char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } -char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); 
} -char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } -char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } -char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } +const char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } +const char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } +const char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } +const char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } +const char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } +const char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 99c7d000d..250504c23 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -36,8 +36,8 @@ // -- General library information ---------------------------------------------- -BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); -BLIS_EXPORT_BLIS char* 
bli_info_get_int_type_size_str( void ); +BLIS_EXPORT_BLIS const char* bli_info_get_version_str( void ); +BLIS_EXPORT_BLIS const char* bli_info_get_int_type_size_str( void ); // -- General configuration-related -------------------------------------------- @@ -81,24 +81,24 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Level-3 kernel definitions -- -BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- -BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* 
bli_info_get_trmm3_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemmt_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_hemm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_herk_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_her2k_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_symm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_syrk_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_syr2k_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trmm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trmm3_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trsm_impl_string( num_t dt ); diff --git a/frame/base/bli_mbool.h b/frame/base/bli_mbool.h index 6a989590b..d00424273 100644 --- a/frame/base/bli_mbool.h +++ b/frame/base/bli_mbool.h @@ -36,7 +36,7 @@ // mbool_t query -BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) +BLIS_INLINE bool bli_mbool_get_dt( num_t dt, const mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index d61e97021..c25511486 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -66,33 +66,33 @@ BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) return &(mem->pblk); } -BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) +BLIS_INLINE void* bli_mem_buffer( const mem_t* mem ) { - return bli_pblk_buf( bli_mem_pblk( mem ) ); + return bli_pblk_buf( bli_mem_pblk( ( mem_t* )mem ) ); } -BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) +BLIS_INLINE packbuf_t bli_mem_buf_type( const mem_t* mem ) { return mem->buf_type; } -BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) +BLIS_INLINE pool_t* bli_mem_pool( const mem_t* mem ) { return mem->pool; } -BLIS_INLINE siz_t 
bli_mem_size( mem_t* mem ) +BLIS_INLINE siz_t bli_mem_size( const mem_t* mem ) { return mem->size; } -BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) +BLIS_INLINE bool bli_mem_is_alloc( const mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } -BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) +BLIS_INLINE bool bli_mem_is_unalloc( const mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); @@ -160,4 +160,4 @@ BLIS_INLINE void bli_mem_clear( mem_t* mem ) } -#endif +#endif diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index ca3c46f99..7b62ded5c 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -44,7 +44,7 @@ void bli_memsys_init( void ) // contexts for induced methods. // NOTE: Instead of calling bli_gks_query_cntx(), we call // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). - cntx_t* cntx_p = bli_gks_query_cntx_noinit(); + const cntx_t* cntx_p = bli_gks_query_cntx_noinit(); // Initialize the packing block allocator and its data structures. bli_pba_init( cntx_p ); diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 23fbb4cd1..043bd1088 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -82,16 +82,13 @@ void bli_obj_create_without_buffer obj_t* obj ) { - siz_t elem_size; - void* s; - bli_init_once(); if ( bli_error_checking_is_enabled() ) bli_obj_create_without_buffer_check( dt, m, n, obj ); // Query the size of one element of the object's pre-set datatype. - elem_size = bli_dt_size( dt ); + siz_t elem_size = bli_dt_size( dt ); // Set any default properties that are appropriate. bli_obj_set_defaults( obj ); @@ -125,7 +122,7 @@ void bli_obj_create_without_buffer // Set the internal scalar to 1.0. bli_obj_set_scalar_dt( dt, obj ); - s = bli_obj_internal_scalar_buffer( obj ); + void* s = bli_obj_internal_scalar_buffer( obj ); // Always writing the imaginary component is needed in mixed-domain // scenarios. 
Failing to do this can lead to reading uninitialized @@ -147,21 +144,17 @@ void bli_obj_alloc_buffer obj_t* obj ) { - dim_t n_elem = 0; - dim_t m, n; - siz_t elem_size; - siz_t buffer_size; - void* p; - err_t r_val; + dim_t n_elem = 0; + err_t r_val; bli_init_once(); // Query the dimensions of the object we are allocating. - m = bli_obj_length( obj ); - n = bli_obj_width( obj ); + dim_t m = bli_obj_length( obj ); + dim_t n = bli_obj_width( obj ); // Query the size of one element. - elem_size = bli_obj_elem_size( obj ); + siz_t elem_size = bli_obj_elem_size( obj ); // Adjust the strides, if needed, before doing anything else // (particularly, before doing any error checking). @@ -198,10 +191,10 @@ void bli_obj_alloc_buffer // Compute the size of the total buffer to be allocated, which includes // padding if the leading dimension was increased for alignment purposes. - buffer_size = ( siz_t )n_elem * elem_size; + siz_t buffer_size = ( siz_t )n_elem * elem_size; // Allocate the buffer. - p = bli_malloc_user( buffer_size, &r_val ); + void* p = bli_malloc_user( buffer_size, &r_val ); // Set individual fields. 
bli_obj_set_buffer( p, obj ); @@ -264,8 +257,8 @@ void bli_obj_create_1x1_with_attached_buffer void bli_obj_create_conf_to ( - obj_t* s, - obj_t* d + const obj_t* s, + obj_t* d ) { const num_t dt = bli_obj_dt( s ); @@ -552,7 +545,7 @@ static char* dt_names[ BLIS_NUM_FP_TYPES+1 ] = "int" }; -char* bli_dt_string +const char* bli_dt_string ( num_t dt ) @@ -600,15 +593,13 @@ dim_t bli_align_dim_to_size dim_t bli_align_ptr_to_size ( - void* p, - size_t align_size + const void* p, + size_t align_size ) { - dim_t dim; - - dim = ( ( ( uintptr_t )p + align_size - 1 ) / - align_size - ) * align_size; + dim_t dim = ( ( ( uintptr_t )p + align_size - 1 ) / + align_size + ) * align_size; return dim; } @@ -634,13 +625,13 @@ num_t bli_dt_union( num_t dt1, num_t dt2 ) void bli_obj_print ( - char* label, - obj_t* obj + const char* label, + const obj_t* obj ) { bli_init_once(); - FILE* file = stdout; + FILE* file = stdout; if ( bli_error_checking_is_enabled() ) bli_obj_print_check( label, obj ); diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h index 4436d2cd8..a446c09c8 100644 --- a/frame/base/bli_obj.h +++ b/frame/base/bli_obj.h @@ -95,8 +95,8 @@ BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( - obj_t* s, - obj_t* d + const obj_t* s, + obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free @@ -119,7 +119,7 @@ BLIS_EXPORT_BLIS siz_t bli_dt_size num_t dt ); -BLIS_EXPORT_BLIS char* bli_dt_string +BLIS_EXPORT_BLIS const char* bli_dt_string ( num_t dt ); @@ -139,13 +139,13 @@ BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( - void* p, - size_t align_size + const void* p, + size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( - char* label, - obj_t* obj + const char* label, + const obj_t* obj ); diff --git a/frame/base/bli_obj_scalar.c b/frame/base/bli_obj_scalar.c index e28d4fda9..2ef9751f6 100644 --- a/frame/base/bli_obj_scalar.c +++ b/frame/base/bli_obj_scalar.c @@ -41,15 
+41,13 @@ void bli_obj_scalar_init_detached obj_t* beta ) { - void* p; - // Initialize beta without a buffer and then attach its internal buffer. // NOTE: This initializes both the storage datatype and scalar datatype // bitfields within beta to dt. bli_obj_create_without_buffer( dt, 1, 1, beta ); // Query the address of the object's internal scalar buffer. - p = bli_obj_internal_scalar_buffer( beta ); + void* p = bli_obj_internal_scalar_buffer( beta ); // Update the object. bli_obj_set_buffer( p, beta ); @@ -59,10 +57,10 @@ void bli_obj_scalar_init_detached void bli_obj_scalar_init_detached_copy_of ( - num_t dt, - conj_t conj, - obj_t* alpha, - obj_t* beta + num_t dt, + conj_t conj, + const obj_t* alpha, + obj_t* beta ) { obj_t alpha_local; @@ -81,8 +79,8 @@ void bli_obj_scalar_init_detached_copy_of void bli_obj_scalar_detach ( - obj_t* a, - obj_t* alpha + const obj_t* a, + obj_t* alpha ) { // Use the scalar datatype of A as the storage datatype of the detached @@ -103,9 +101,9 @@ void bli_obj_scalar_detach void bli_obj_scalar_attach ( - conj_t conj, - obj_t* alpha, - obj_t* a + conj_t conj, + const obj_t* alpha, + obj_t* a ) { obj_t alpha_cast; @@ -165,8 +163,8 @@ void bli_obj_scalar_cast_to void bli_obj_scalar_apply_scalar ( - obj_t* alpha, - obj_t* a + const obj_t* alpha, + obj_t* a ) { obj_t alpha_cast; @@ -193,9 +191,9 @@ void bli_obj_scalar_reset obj_t* a ) { - num_t dt = bli_obj_scalar_dt( a ); - void* scalar_a = bli_obj_internal_scalar_buffer( a ); - void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + num_t dt = bli_obj_scalar_dt( a ); + void* scalar_a = bli_obj_internal_scalar_buffer( a ); + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); if ( bli_is_float( dt ) ) *(( float* )scalar_a) = *(( float* )one); else if ( bli_is_double( dt ) ) *(( double* )scalar_a) = *(( double* )one); @@ -211,9 +209,9 @@ bool bli_obj_scalar_has_nonzero_imag obj_t* a ) { - bool r_val = FALSE; - num_t dt = bli_obj_scalar_dt( a ); - void* scalar_a = 
bli_obj_internal_scalar_buffer( a ); + bool r_val = FALSE; + num_t dt = bli_obj_scalar_dt( a ); + void* scalar_a = bli_obj_internal_scalar_buffer( a ); // FGVZ: Reimplement by using bli_obj_imag_part() and then // bli_obj_equals( &BLIS_ZERO, ... ). @@ -236,16 +234,15 @@ bool bli_obj_scalar_has_nonzero_imag bool bli_obj_scalar_equals ( - obj_t* a, - obj_t* beta + const obj_t* a, + const obj_t* beta ) { obj_t scalar_a; - bool r_val; bli_obj_scalar_detach( a, &scalar_a ); - r_val = bli_obj_equals( &scalar_a, beta ); + bool r_val = bli_obj_equals( &scalar_a, beta ); return r_val; } diff --git a/frame/base/bli_obj_scalar.h b/frame/base/bli_obj_scalar.h index 86b699659..23bf573c6 100644 --- a/frame/base/bli_obj_scalar.h +++ b/frame/base/bli_obj_scalar.h @@ -40,23 +40,23 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( - num_t dt, - conj_t conj, - obj_t* alpha, - obj_t* beta + num_t dt, + conj_t conj, + const obj_t* alpha, + obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( - obj_t* a, - obj_t* alpha + const obj_t* a, + obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( - conj_t conj, - obj_t* alpha, - obj_t* a + conj_t conj, + const obj_t* alpha, + obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to @@ -67,8 +67,8 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( - obj_t* alpha, - obj_t* a + const obj_t* alpha, + obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset @@ -83,7 +83,7 @@ BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( - obj_t* a, - obj_t* beta + const obj_t* a, + const obj_t* beta ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index 95587e4a7..f3a2deeb4 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -40,12 +40,12 @@ void bli_acquire_mpart ( - dim_t i, - dim_t j, - dim_t bm, - dim_t bn, - obj_t* parent, - obj_t* child + dim_t i, + 
dim_t j, + dim_t bm, + dim_t bn, + const obj_t* parent, + obj_t* child ) { // Query the dimensions of the parent object. @@ -83,11 +83,11 @@ void bli_acquire_mpart void bli_acquire_mpart_t2b ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mdim( BLIS_FWD, req_part, i, b, obj, sub_obj ); @@ -96,11 +96,11 @@ void bli_acquire_mpart_t2b void bli_acquire_mpart_b2t ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mdim( BLIS_BWD, req_part, i, b, obj, sub_obj ); @@ -109,12 +109,12 @@ void bli_acquire_mpart_b2t void bli_acquire_mpart_mdim ( - dir_t direct, - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -123,7 +123,6 @@ void bli_acquire_mpart_mdim dim_t n_part = 0; inc_t offm_inc = 0; inc_t offn_inc = 0; - doff_t diag_off_inc; // Call a special function for partitioning packed objects. (By only @@ -235,7 +234,7 @@ void bli_acquire_mpart_mdim // Compute the diagonal offset based on the m and n offsets. 
- diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; + doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; // Begin by copying the info, elem size, buffer, row stride, and column @@ -307,24 +306,24 @@ void bli_acquire_mpart_mdim void bli_acquire_mpart_l2r ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { - bli_acquire_mpart_ndim( BLIS_FWD, req_part, i, b, obj, sub_obj ); + bli_acquire_mpart_ndim( BLIS_FWD, req_part, j, b, obj, sub_obj ); } void bli_acquire_mpart_r2l ( - subpart_t req_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_ndim( BLIS_BWD, req_part, j, b, obj, sub_obj ); @@ -333,12 +332,12 @@ void bli_acquire_mpart_r2l void bli_acquire_mpart_ndim ( - dir_t direct, - subpart_t req_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -347,7 +346,6 @@ void bli_acquire_mpart_ndim dim_t n_part = 0; inc_t offm_inc = 0; inc_t offn_inc = 0; - doff_t diag_off_inc; // Call a special function for partitioning packed objects. (By only @@ -459,7 +457,7 @@ void bli_acquire_mpart_ndim // Compute the diagonal offset based on the m and n offsets. 
- diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; + doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; // Begin by copying the info, elem size, buffer, row stride, and column @@ -530,11 +528,11 @@ void bli_acquire_mpart_ndim void bli_acquire_mpart_tl2br ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mndim( BLIS_FWD, req_part, i, b, obj, sub_obj ); @@ -543,11 +541,11 @@ void bli_acquire_mpart_tl2br void bli_acquire_mpart_br2tl ( - subpart_t req_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mndim( BLIS_BWD, req_part, j, b, obj, sub_obj ); @@ -556,12 +554,12 @@ void bli_acquire_mpart_br2tl void bli_acquire_mpart_mndim ( - dir_t direct, - subpart_t req_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t ij, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -571,7 +569,6 @@ void bli_acquire_mpart_mndim dim_t n_part = 0; inc_t offm_inc = 0; inc_t offn_inc = 0; - doff_t diag_off_inc; // Call a special function for partitioning packed objects. (By only @@ -712,7 +709,7 @@ void bli_acquire_mpart_mndim // Compute the diagonal offset based on the m and n offsets. 
- diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; + doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; // Begin by copying the info, elem size, buffer, row stride, and column @@ -798,11 +795,11 @@ void bli_acquire_mpart_mndim void bli_acquire_vpart_f2b ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) @@ -814,11 +811,11 @@ void bli_acquire_vpart_f2b void bli_acquire_vpart_b2f ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) @@ -833,10 +830,10 @@ void bli_acquire_vpart_b2f void bli_acquire_mij ( - dim_t i, - dim_t j, - obj_t* obj, - obj_t* sub_obj + dim_t i, + dim_t j, + const obj_t* obj, + obj_t* sub_obj ) { obj_t tmp_obj; @@ -848,9 +845,9 @@ void bli_acquire_mij void bli_acquire_vi ( - dim_t i, - obj_t* obj, - obj_t* sub_obj + dim_t i, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h index 5e56a9fec..6d3e00ced 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -38,12 +38,12 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart ( - dim_t i, - dim_t j, - dim_t m, - dim_t n, - obj_t* obj, - obj_t* sub_obj + dim_t i, + dim_t j, + dim_t m, + dim_t n, + const obj_t* obj, + obj_t* sub_obj ); #undef GENPROT @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) @@ -71,12 +71,12 @@ GENPROT( acquire_mpart_br2tl ) \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - dir_t direct, \ - subpart_t req_part, \ - dim_t i, \ 
- dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ + dir_t direct, \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) @@ -91,11 +91,11 @@ GENPROT( acquire_mpart_mndim ) \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) @@ -105,16 +105,16 @@ GENPROT( acquire_vpart_b2f ) BLIS_EXPORT_BLIS void bli_acquire_mij ( - dim_t i, - dim_t j, - obj_t* obj, - obj_t* sub_obj + dim_t i, + dim_t j, + const obj_t* obj, + obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( - dim_t i, - obj_t* obj, - obj_t* sub_obj + dim_t i, + const obj_t* obj, + obj_t* sub_obj ); diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index f8835e5de..68dffd728 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -48,10 +48,10 @@ pba_t* bli_pba_query( void ) void bli_pba_init ( - cntx_t* restrict cntx + const cntx_t* cntx ) { - pba_t* restrict pba = bli_pba_query(); + pba_t* pba = bli_pba_query(); const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE_GEN; malloc_ft malloc_fp = BLIS_MALLOC_POOL; @@ -77,7 +77,7 @@ void bli_pba_finalize void ) { - pba_t* restrict pba = bli_pba_query(); + pba_t* pba = bli_pba_query(); #ifdef BLIS_ENABLE_PBA_POOLS bli_pba_finalize_pools( pba ); @@ -201,15 +201,11 @@ void bli_pba_release mem_t* mem ) { - packbuf_t buf_type; - pool_t* pool; - pblk_t* pblk; - // Query the memory broker from the runtime. pba_t* pba = bli_rntm_pba( rntm ); // Extract the buffer type so we know what kind of memory was allocated. - buf_type = bli_mem_buf_type( mem ); + packbuf_t buf_type = bli_mem_buf_type( mem ); #ifndef BLIS_ENABLE_PBA_POOLS #ifdef BLIS_ENABLE_MEM_TRACING @@ -231,10 +227,10 @@ void bli_pba_release { // Extract the address of the pool from which the memory was // allocated. 
- pool = bli_mem_pool( mem ); + pool_t* pool = bli_mem_pool( mem ); // Extract the address of the pblk_t struct within the mem_t struct. - pblk = bli_mem_pblk( mem ); + pblk_t* pblk = bli_mem_pblk( mem ); // Acquire the mutex associated with the pba object. bli_pba_lock( pba ); @@ -284,8 +280,8 @@ void bli_pba_acquire_v siz_t bli_pba_pool_size ( - pba_t* pba, - packbuf_t buf_type + const pba_t* pba, + packbuf_t buf_type ) { siz_t r_val; @@ -304,7 +300,7 @@ siz_t bli_pba_pool_size // Acquire the pointer to the pool corresponding to the buf_type // provided. pool_index = bli_packbuf_index( buf_type ); - pool = bli_pba_pool( pool_index, pba ); + pool = bli_pba_pool( pool_index, ( pba_t* )pba ); // Compute the pool "size" as the product of the block size // and the number of blocks in the pool. @@ -319,8 +315,8 @@ siz_t bli_pba_pool_size void bli_pba_init_pools ( - cntx_t* cntx, - pba_t* pba + const cntx_t* cntx, + pba_t* pba ) { // Map each of the packbuf_t values to an index starting at zero. @@ -402,10 +398,10 @@ void bli_pba_finalize_pools void bli_pba_compute_pool_block_sizes ( - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ) { const ind_t im = bli_cntx_method( cntx ); @@ -414,12 +410,10 @@ void bli_pba_compute_pool_block_sizes siz_t bs_cand_b = 0; siz_t bs_cand_c = 0; - num_t dt; - // Compute pool block sizes for each datatype and find the maximum // size for each pool. This is done so that new pools do not need // to be allocated if the user switches datatypes. 
- for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) + for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { siz_t bs_dt_a; siz_t bs_dt_b; @@ -449,71 +443,43 @@ void bli_pba_compute_pool_block_sizes void bli_pba_compute_pool_block_sizes_dt ( - num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ) { - siz_t size_dt = bli_dt_size( dt ); - - blksz_t* mr; - blksz_t* nr; - - blksz_t* mc; - blksz_t* kc; - blksz_t* nc; - - dim_t mr_dt; - dim_t nr_dt; - dim_t max_mnr_dt; - - dim_t mc_max_dt; - dim_t kc_max_dt; - dim_t nc_max_dt; - - dim_t packmr_dt; - dim_t packnr_dt; - dim_t max_packmnr_dt; - - dim_t scale_num_dt; - dim_t scale_den_dt; - - dim_t pool_mc_dt, left_mc_dt; - dim_t pool_nc_dt, left_nc_dt; - dim_t pool_kc_dt; - // // Find the larger of the two register blocksizes. // // Query the mr and nr blksz_t objects for the given method of // execution. - mr = bli_cntx_get_blksz( BLIS_MR, cntx ); - nr = bli_cntx_get_blksz( BLIS_NR, cntx ); + const blksz_t* mr = bli_cntx_get_blksz( BLIS_MR, cntx ); + const blksz_t* nr = bli_cntx_get_blksz( BLIS_NR, cntx ); // Extract the mr and nr values specific to the current datatype. - mr_dt = bli_blksz_get_def( dt, mr ); - nr_dt = bli_blksz_get_def( dt, nr ); + dim_t mr_dt = bli_blksz_get_def( dt, mr ); + dim_t nr_dt = bli_blksz_get_def( dt, nr ); // Find the maximum of mr and nr. - max_mnr_dt = bli_max( mr_dt, nr_dt ); + dim_t max_mnr_dt = bli_max( mr_dt, nr_dt ); // // Define local maximum cache blocksizes. // // Query the mc, kc, and nc blksz_t objects for native execution. 
- mc = bli_cntx_get_blksz( BLIS_MC, cntx ); - kc = bli_cntx_get_blksz( BLIS_KC, cntx ); - nc = bli_cntx_get_blksz( BLIS_NC, cntx ); + const blksz_t* mc = bli_cntx_get_blksz( BLIS_MC, cntx ); + const blksz_t* kc = bli_cntx_get_blksz( BLIS_KC, cntx ); + const blksz_t* nc = bli_cntx_get_blksz( BLIS_NC, cntx ); // Extract the maximum mc, kc, and nc values specific to the current // datatype. - mc_max_dt = bli_blksz_get_max( dt, mc ); - kc_max_dt = bli_blksz_get_max( dt, kc ); - nc_max_dt = bli_blksz_get_max( dt, nc ); + dim_t mc_max_dt = bli_blksz_get_max( dt, mc ); + dim_t kc_max_dt = bli_blksz_get_max( dt, kc ); + dim_t nc_max_dt = bli_blksz_get_max( dt, nc ); // Add max(mr,nr) to kc to make room for the nudging of kc at // runtime to be a multiple of mr or nr for triangular operations @@ -545,8 +511,11 @@ void bli_pba_compute_pool_block_sizes_dt // So, if packmr * nr >= packnr * mr, then we will use packmr and mr as // our scaling factors. Otherwise, we'll use packnr and nr. - packmr_dt = bli_blksz_get_max( dt, mr ); - packnr_dt = bli_blksz_get_max( dt, nr ); + dim_t packmr_dt = bli_blksz_get_max( dt, mr ); + dim_t packnr_dt = bli_blksz_get_max( dt, nr ); + + dim_t scale_num_dt; + dim_t scale_den_dt; if ( packmr_dt * nr_dt >= packnr_dt * mr_dt ) { scale_num_dt = packmr_dt; @@ -558,13 +527,13 @@ void bli_pba_compute_pool_block_sizes_dt // Compute pool block dimensions. 
// - pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; - left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; + dim_t pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; + dim_t left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; - pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; - left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; + dim_t pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; + dim_t left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; - pool_kc_dt = ( kc_max_dt ); + dim_t pool_kc_dt = ( kc_max_dt ); if ( left_mc_dt > 0 ) pool_mc_dt += 1; if ( left_nc_dt > 0 ) pool_nc_dt += 1; @@ -573,10 +542,12 @@ void bli_pba_compute_pool_block_sizes_dt // Compute pool block sizes // + siz_t size_dt = bli_dt_size( dt ); + // We add an extra micro-panel of space to the block sizes for A and B // just to be sure any pre-loading performed by the micro-kernel does // not cause a segmentation fault. - max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); + dim_t max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); *bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; *bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index 6431607ec..dfda53090 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -34,8 +34,8 @@ */ -#ifndef BLIS_MEMBRK_H -#define BLIS_MEMBRK_H +#ifndef BLIS_PBA_H +#define BLIS_PBA_H // Packing block allocator (formerly memory broker) @@ -73,17 +73,17 @@ BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) return &(pba->pools[ pool_index ]); } -BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) +BLIS_INLINE siz_t bli_pba_align_size( const pba_t* pba ) { return pba->align_size; } -BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) +BLIS_INLINE malloc_ft bli_pba_malloc_fp( const pba_t* pba ) { return pba->malloc_fp; } -BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) +BLIS_INLINE free_ft bli_pba_free_fp( 
const pba_t* pba ) { return pba->free_fp; } @@ -123,7 +123,7 @@ BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( - cntx_t* cntx + const cntx_t* cntx ); void bli_pba_finalize ( @@ -156,16 +156,16 @@ BLIS_INLINE void bli_pba_rntm_set_pba siz_t bli_pba_pool_size ( - pba_t* pba, - packbuf_t buf_type + const pba_t* pba, + packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( - cntx_t* cntx, - pba_t* pba + const cntx_t* cntx, + pba_t* pba ); void bli_pba_finalize_pools ( @@ -174,18 +174,18 @@ void bli_pba_finalize_pools void bli_pba_compute_pool_block_sizes ( - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( - num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ); #endif diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 112ab68e8..684b0ef73 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -39,14 +39,14 @@ void bli_pool_init ( - siz_t num_blocks, - siz_t block_ptrs_len, - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - free_ft free_fp, - pool_t* restrict pool + siz_t num_blocks, + siz_t block_ptrs_len, + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + free_ft free_fp, + pool_t* pool ) { err_t r_val; @@ -67,7 +67,7 @@ void bli_pool_init // Allocate the block_ptrs array. // FGVZ: Do we want to call malloc_fp() for internal data structures as // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. 
- pblk_t* restrict block_ptrs + pblk_t* block_ptrs = bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ), &r_val ); @@ -115,7 +115,7 @@ void bli_pool_init void bli_pool_finalize ( - pool_t* restrict pool + pool_t* pool ) { // NOTE: This implementation assumes that either: @@ -124,7 +124,7 @@ void bli_pool_finalize // is bli_pool_reinit(). // Query the block_ptrs array. - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the total number of blocks currently allocated. const siz_t num_blocks = bli_pool_num_blocks( pool ); @@ -196,12 +196,12 @@ void bli_pool_finalize void bli_pool_reinit ( - siz_t num_blocks_new, - siz_t block_ptrs_len_new, - siz_t block_size_new, - siz_t align_size_new, - siz_t offset_size_new, - pool_t* restrict pool + siz_t num_blocks_new, + siz_t block_ptrs_len_new, + siz_t block_size_new, + siz_t align_size_new, + siz_t offset_size_new, + pool_t* pool ) { // Preserve the pointers to malloc() and free() provided when the pool @@ -234,9 +234,9 @@ void bli_pool_reinit void bli_pool_checkout_block ( - siz_t req_size, - pblk_t* restrict block, - pool_t* restrict pool + siz_t req_size, + pblk_t* block, + pool_t* pool ) { // If the requested block size is smaller than what the pool was @@ -282,7 +282,7 @@ void bli_pool_checkout_block // At this point, at least one block is guaranteed to be available. // Query the block_ptrs array. - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -309,8 +309,8 @@ void bli_pool_checkout_block void bli_pool_checkin_block ( - pblk_t* restrict block, - pool_t* restrict pool + pblk_t* block, + pool_t* pool ) { // If the pblk_t being checked in was allocated with a different block @@ -330,7 +330,7 @@ void bli_pool_checkin_block } // Query the block_ptrs array. 
- pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -353,8 +353,8 @@ void bli_pool_checkin_block void bli_pool_grow ( - siz_t num_blocks_add, - pool_t* restrict pool + siz_t num_blocks_add, + pool_t* pool ) { err_t r_val; @@ -394,12 +394,12 @@ void bli_pool_grow #endif // Query the current block_ptrs array. - pblk_t* restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs_cur = bli_pool_block_ptrs( pool ); // Allocate a new block_ptrs array. // FGVZ: Do we want to call malloc_fp() for internal data structures as // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. - pblk_t* restrict block_ptrs_new + pblk_t* block_ptrs_new = bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val ); @@ -433,7 +433,7 @@ void bli_pool_grow // blocks. // Query the current block_ptrs array (which was mabye just resized). - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the block size and alignment size of the pool. const siz_t block_size = bli_pool_block_size( pool ); @@ -470,8 +470,8 @@ void bli_pool_grow void bli_pool_shrink ( - siz_t num_blocks_sub, - pool_t* restrict pool + siz_t num_blocks_sub, + pool_t* pool ) { // If the requested decrease is zero, return early. @@ -493,7 +493,7 @@ void bli_pool_shrink num_blocks_sub = bli_min( num_blocks_sub, num_blocks_avail ); // Query the block_ptrs array. - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Compute the new total number of blocks. 
const siz_t num_blocks_new = num_blocks - num_blocks_sub; @@ -520,11 +520,11 @@ void bli_pool_shrink void bli_pool_alloc_block ( - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - pblk_t* restrict block + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + pblk_t* block ) { err_t r_val; @@ -540,7 +540,7 @@ void bli_pool_alloc_block // be recovered when it's time to free the block. Note that we have to // add offset_size to the number of bytes requested since we will skip // that many bytes at the beginning of the allocated memory. - void* restrict buf + void* buf = bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size, &r_val ); @@ -579,7 +579,7 @@ void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, - pblk_t* restrict block + pblk_t* block ) { #ifdef BLIS_ENABLE_MEM_TRACING @@ -590,7 +590,7 @@ void bli_pool_free_block // Extract the pblk_t buffer, which is the aligned address returned from // bli_fmalloc_align() when the block was allocated. - void* restrict buf = bli_pblk_buf( block ); + void* buf = bli_pblk_buf( block ); // Undo the pointer advancement by offset_size bytes performed previously // by bli_pool_alloc_block(). 
@@ -604,7 +604,7 @@ void bli_pool_free_block void bli_pool_print ( - pool_t* restrict pool + const pool_t* pool ) { pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); @@ -633,7 +633,7 @@ void bli_pool_print void bli_pblk_print ( - pblk_t* restrict pblk + const pblk_t* pblk ) { void* buf = bli_pblk_buf( pblk ); diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h index b4bb23fec..0b16ae8ee 100644 --- a/frame/base/bli_pool.h +++ b/frame/base/bli_pool.h @@ -70,12 +70,12 @@ typedef struct // Pool block query -BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) +BLIS_INLINE void* bli_pblk_buf( const pblk_t* pblk ) { return pblk->buf; } -BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) +BLIS_INLINE siz_t bli_pblk_block_size( const pblk_t* pblk ) { return pblk->block_size; } @@ -115,52 +115,52 @@ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) // Pool entry query -BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) +BLIS_INLINE void* bli_pool_block_ptrs( const pool_t* pool ) { return pool->block_ptrs; } -BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_block_ptrs_len( const pool_t* pool ) { return pool->block_ptrs_len; } -BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_num_blocks( const pool_t* pool ) { return pool->num_blocks; } -BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_block_size( const pool_t* pool ) { return pool->block_size; } -BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_align_size( const pool_t* pool ) { return pool->align_size; } -BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_offset_size( const pool_t* pool ) { return pool->offset_size; } -BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) +BLIS_INLINE malloc_ft bli_pool_malloc_fp( const pool_t* pool ) { return pool->malloc_fp; } -BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) +BLIS_INLINE free_ft 
bli_pool_free_fp( const pool_t* pool ) { return pool->free_fp; } -BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_top_index( const pool_t* pool ) { return pool->top_index; } -BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) +BLIS_INLINE bool bli_pool_is_exhausted( const pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); @@ -217,74 +217,74 @@ BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ void bli_pool_init ( - siz_t num_blocks, - siz_t block_ptrs_len, - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - free_ft free_fp, - pool_t* restrict pool + siz_t num_blocks, + siz_t block_ptrs_len, + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + free_ft free_fp, + pool_t* pool ); void bli_pool_finalize ( - pool_t* restrict pool + pool_t* pool ); void bli_pool_reinit ( - siz_t num_blocks_new, - siz_t block_ptrs_len_new, - siz_t block_size_new, - siz_t align_size_new, - siz_t offset_size_new, - pool_t* restrict pool + siz_t num_blocks_new, + siz_t block_ptrs_len_new, + siz_t block_size_new, + siz_t align_size_new, + siz_t offset_size_new, + pool_t* pool ); void bli_pool_checkout_block ( - siz_t req_size, - pblk_t* restrict block, - pool_t* restrict pool + siz_t req_size, + pblk_t* block, + pool_t* pool ); void bli_pool_checkin_block ( - pblk_t* restrict block, - pool_t* restrict pool + pblk_t* block, + pool_t* pool ); void bli_pool_grow ( - siz_t num_blocks_add, - pool_t* restrict pool + siz_t num_blocks_add, + pool_t* pool ); void bli_pool_shrink ( - siz_t num_blocks_sub, - pool_t* restrict pool + siz_t num_blocks_sub, + pool_t* pool ); void bli_pool_alloc_block ( - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - pblk_t* restrict block + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + pblk_t* block ); void bli_pool_free_block ( 
- siz_t offset_size, - free_ft free_fp, - pblk_t* restrict block + siz_t offset_size, + free_ft free_fp, + pblk_t* block ); void bli_pool_print ( - pool_t* restrict pool + const pool_t* pool ); void bli_pblk_print ( - pblk_t* restrict pblk + const pblk_t* pblk ); #endif diff --git a/frame/base/bli_query.c b/frame/base/bli_query.c index c62a30ccc..140fc2f97 100644 --- a/frame/base/bli_query.c +++ b/frame/base/bli_query.c @@ -34,7 +34,7 @@ #include "blis.h" -bool bli_obj_equals( obj_t* a, obj_t* b ) +bool bli_obj_equals( const obj_t* a, const obj_t* b ) { #if 0 bool r_val = FALSE; @@ -95,7 +95,7 @@ bool bli_obj_equals( obj_t* a, obj_t* b ) #endif } -bool bli_obj_imag_equals( obj_t* a, obj_t* b ) +bool bli_obj_imag_equals( const obj_t* a, const obj_t* b ) { #if 0 bool r_val = FALSE; @@ -165,7 +165,7 @@ bool bli_obj_imag_equals( obj_t* a, obj_t* b ) return r_val; } -bool bli_obj_imag_is_zero( obj_t* a ) +bool bli_obj_imag_is_zero( const obj_t* a ) { bool r_val = TRUE; diff --git a/frame/base/bli_query.h b/frame/base/bli_query.h index 65246050b..d2decf928 100644 --- a/frame/base/bli_query.h +++ b/frame/base/bli_query.h @@ -32,8 +32,8 @@ */ -BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); +BLIS_EXPORT_BLIS bool bli_obj_equals( const obj_t* a, const obj_t* b ); -BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); +BLIS_EXPORT_BLIS bool bli_obj_imag_equals( const obj_t* a, const obj_t* b ); -BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); +BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( const obj_t* a ); diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index a6ded35b3..2c13c74a2 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -410,7 +410,7 @@ void bli_rntm_set_ways_from_rntm_sup void bli_rntm_print ( - rntm_t* rntm + const rntm_t* rntm ) { dim_t af = bli_rntm_auto_factor( rntm ); @@ -433,8 +433,8 @@ void bli_rntm_print dim_t bli_rntm_calc_num_threads_in ( - bszid_t* restrict bszid_cur, - rntm_t* restrict rntm + const 
bszid_t* bszid_cur, + const rntm_t* rntm ) { /* // bp algorithm: diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index 249a69805..2a39f8894 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -61,56 +61,56 @@ typedef struct rntm_s // -- rntm_t query (public API) ------------------------------------------------ // -BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_auto_factor( const rntm_t* rntm ) { return rntm->auto_factor; } -BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_num_threads( const rntm_t* rntm ) { return rntm->num_threads; } -BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } -BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } -BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_pc_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } -BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_ic_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } -BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_jr_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } -BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_ir_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } -BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_pr_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } -BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_pack_a( const rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } -BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_pack_b( const rntm_t* rntm 
) { return ( bool )( rntm->pack_b ); } -BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_l3_sup( const rntm_t* rntm ) { return rntm->l3_sup; } @@ -119,12 +119,12 @@ BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) // -- rntm_t query (internal use only) ----------------------------------------- // -BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) +BLIS_INLINE pool_t* bli_rntm_sba_pool( const rntm_t* rntm ) { return rntm->sba_pool; } -BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) +BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm ) { return rntm->pba; } @@ -334,7 +334,7 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) BLIS_INLINE dim_t bli_rntm_calc_num_threads ( - rntm_t* restrict rntm + const rntm_t* rntm ) { dim_t n_threads; @@ -382,13 +382,13 @@ void bli_rntm_set_ways_from_rntm_sup void bli_rntm_print ( - rntm_t* rntm + const rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( - bszid_t* restrict bszid_cur, - rntm_t* restrict rntm + const bszid_t* bszid_cur, + const rntm_t* rntm ); #endif diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 5b6ff6a0f..776622bb4 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -57,8 +57,8 @@ void bli_sba_finalize( void ) void* bli_sba_acquire ( - rntm_t* restrict rntm, - siz_t req_size + rntm_t* rntm, + siz_t req_size ) { void* block; @@ -74,7 +74,7 @@ void* bli_sba_acquire pblk_t pblk; // Query the small block pool from the rntm. - pool_t* restrict pool = bli_rntm_sba_pool( rntm ); + pool_t* pool = bli_rntm_sba_pool( rntm ); // We don't expect NULL sba_pool pointers in the normal course of BLIS // operation. However, there are rare instances where it is convenient @@ -122,8 +122,8 @@ void* bli_sba_acquire void bli_sba_release ( - rntm_t* restrict rntm, - void* restrict block + rntm_t* rntm, + void* block ) { #ifdef BLIS_ENABLE_SBA_POOLS @@ -136,7 +136,7 @@ void bli_sba_release pblk_t pblk; // Query the small block pool from the rntm. 
- pool_t* restrict pool = bli_rntm_sba_pool( rntm ); + pool_t* pool = bli_rntm_sba_pool( rntm ); if ( pool == NULL ) { @@ -182,7 +182,7 @@ array_t* bli_sba_checkout_array void bli_sba_checkin_array ( - array_t* restrict array + array_t* array ) { #ifndef BLIS_ENABLE_SBA_POOLS @@ -194,9 +194,9 @@ void bli_sba_checkin_array void bli_sba_rntm_set_pool ( - siz_t index, - array_t* restrict array, - rntm_t* restrict rntm + siz_t index, + array_t* array, + rntm_t* rntm ) { #ifndef BLIS_ENABLE_SBA_POOLS @@ -205,7 +205,7 @@ void bli_sba_rntm_set_pool #endif // Query the pool_t* in the array_t corresponding to index. - pool_t* restrict pool = bli_apool_array_elem( index, array ); + pool_t* pool = bli_apool_array_elem( index, array ); // Embed the pool_t* into the rntm_t. bli_rntm_set_sba_pool( pool, rntm ); diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h index f5e36d759..4fc3aaaee 100644 --- a/frame/base/bli_sba.h +++ b/frame/base/bli_sba.h @@ -44,30 +44,30 @@ void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( - const siz_t n_threads + siz_t n_threads ); void bli_sba_checkin_array ( - array_t* restrict array + array_t* array ); void bli_sba_rntm_set_pool ( - siz_t index, - array_t* restrict array, - rntm_t* restrict rntm + siz_t index, + array_t* array, + rntm_t* rntm ); void* bli_sba_acquire ( - rntm_t* restrict rntm, - siz_t req_size + rntm_t* rntm, + siz_t req_size ); void bli_sba_release ( - rntm_t* restrict rntm, - void* restrict block + rntm_t* rntm, + void* block ); diff --git a/frame/base/bli_setgetijm.c b/frame/base/bli_setgetijm.c index 78ff58a29..d056a2e44 100644 --- a/frame/base/bli_setgetijm.c +++ b/frame/base/bli_setgetijm.c @@ -36,21 +36,21 @@ typedef void (*setijm_fp) ( - double ar, - double ai, - dim_t i, - dim_t j, - void* restrict b, inc_t rs, inc_t cs + double ar, + double ai, + dim_t i, + dim_t j, + void* b, inc_t rs, inc_t cs ); static setijm_fp GENARRAY(ftypes_setijm,setijm); err_t bli_setijm ( - double ar, - double ai, - dim_t 
i, - dim_t j, - obj_t* b + double ar, + double ai, + dim_t i, + dim_t j, + const obj_t* b ) { dim_t m = bli_obj_length( b ); @@ -90,16 +90,16 @@ err_t bli_setijm \ void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs \ + double ar, \ + double ai, \ + dim_t i, \ + dim_t j, \ + void* b, inc_t rs, inc_t cs \ ) \ { \ - ctype* restrict b_cast = ( ctype* )b; \ + ctype* b_cast = ( ctype* )b; \ \ - ctype* restrict b_ij = b_cast + (i )*rs + (j )*cs; \ + ctype* b_ij = b_cast + (i )*rs + (j )*cs; \ \ PASTEMAC2(z,ch,sets)( ar, ai, *b_ij ); \ } @@ -110,21 +110,21 @@ INSERT_GENTFUNC_BASIC0( setijm ) typedef void (*getijm_fp) ( - dim_t i, - dim_t j, - void* restrict b, inc_t rs, inc_t cs, - double* ar, - double* ai + dim_t i, + dim_t j, + const void* b, inc_t rs, inc_t cs, + double* ar, + double* ai ); static getijm_fp GENARRAY(ftypes_getijm,getijm); err_t bli_getijm ( - dim_t i, - dim_t j, - obj_t* b, - double* ar, - double* ai + dim_t i, + dim_t j, + const obj_t* b, + double* ar, + double* ai ) { dim_t m = bli_obj_length( b ); @@ -164,16 +164,16 @@ err_t bli_getijm \ void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs, \ - double* ar, \ - double* ai \ + dim_t i, \ + dim_t j, \ + const void* b, inc_t rs, inc_t cs, \ + double* ar, \ + double* ai \ ) \ { \ - ctype* restrict b_cast = ( ctype* )b; \ + const ctype* b_cast = ( const ctype* )b; \ \ - ctype* restrict b_ij = b_cast + (i )*rs + (j )*cs; \ + const ctype* b_ij = b_cast + (i )*rs + (j )*cs; \ \ PASTEMAC2(ch,z,gets)( *b_ij, *ar, *ai ); \ } diff --git a/frame/base/bli_setgetijm.h b/frame/base/bli_setgetijm.h index 55ce0ee11..a2db16d11 100644 --- a/frame/base/bli_setgetijm.h +++ b/frame/base/bli_setgetijm.h @@ -34,11 +34,11 @@ BLIS_EXPORT_BLIS err_t bli_setijm ( - double ar, - double ai, - dim_t i, - dim_t j, - obj_t* b + double ar, + double ai, + dim_t i, + dim_t j, + const obj_t* b ); #undef GENTPROT @@ -46,11 
+46,11 @@ BLIS_EXPORT_BLIS err_t bli_setijm \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs \ + double ar, \ + double ai, \ + dim_t i, \ + dim_t j, \ + void* b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) @@ -59,11 +59,11 @@ INSERT_GENTPROT_BASIC0( setijm ) BLIS_EXPORT_BLIS err_t bli_getijm ( - dim_t i, - dim_t j, - obj_t* b, - double* ar, - double* ai + dim_t i, + dim_t j, + const obj_t* b, + double* ar, + double* ai ); #undef GENTPROT @@ -71,11 +71,11 @@ BLIS_EXPORT_BLIS err_t bli_getijm \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs, \ - double* ar, \ - double* ai \ + dim_t i, \ + dim_t j, \ + const void* b, inc_t rs, inc_t cs, \ + double* ar, \ + double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c index 610f6f271..6cee789c7 100644 --- a/frame/base/bli_setgetijv.c +++ b/frame/base/bli_setgetijv.c @@ -36,19 +36,19 @@ typedef void (*setijv_fp) ( - double ar, - double ai, - dim_t i, - void* restrict x, inc_t incx + double ar, + double ai, + dim_t i, + void* x, inc_t incx ); static setijv_fp GENARRAY(ftypes_setijv,setijv); err_t bli_setijv ( - double ar, - double ai, - dim_t i, - obj_t* x + double ar, + double ai, + dim_t i, + const obj_t* x ) { dim_t n = bli_obj_vector_dim( x ); @@ -84,10 +84,10 @@ err_t bli_setijv \ void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - void* restrict x, inc_t incx \ + double ar, \ + double ai, \ + dim_t i, \ + void* x, inc_t incx \ ) \ { \ ctype* restrict x_cast = ( ctype* )x; \ @@ -103,19 +103,19 @@ INSERT_GENTFUNC_BASIC0( setijv ) typedef void (*getijv_fp) ( - dim_t i, - void* restrict x, inc_t incx, - double* ar, - double* ai + dim_t i, + const void* x, inc_t incx, + double* ar, + double* ai ); static getijv_fp GENARRAY(ftypes_getijv,getijv); err_t bli_getijv ( - 
dim_t i, - obj_t* x, - double* ar, - double* ai + dim_t i, + const obj_t* x, + double* ar, + double* ai ) { dim_t n = bli_obj_vector_dim( x ); @@ -151,15 +151,15 @@ err_t bli_getijv \ void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - void* restrict x, inc_t incx, \ - double* ar, \ - double* ai \ + dim_t i, \ + const void* x, inc_t incx, \ + double* ar, \ + double* ai \ ) \ { \ - ctype* restrict x_cast = ( ctype* )x; \ + const ctype* restrict x_cast = ( const ctype* )x; \ \ - ctype* restrict x_i = x_cast + (i )*incx; \ + const ctype* restrict x_i = x_cast + (i )*incx; \ \ PASTEMAC2(ch,z,gets)( *x_i, *ar, *ai ); \ } diff --git a/frame/base/bli_setgetijv.h b/frame/base/bli_setgetijv.h index 703fe41aa..a9badce4d 100644 --- a/frame/base/bli_setgetijv.h +++ b/frame/base/bli_setgetijv.h @@ -34,10 +34,10 @@ BLIS_EXPORT_BLIS err_t bli_setijv ( - double ar, - double ai, - dim_t i, - obj_t* x + double ar, + double ai, + dim_t i, + const obj_t* x ); #undef GENTPROT @@ -45,10 +45,10 @@ BLIS_EXPORT_BLIS err_t bli_setijv \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - void* restrict x, inc_t incx \ + double ar, \ + double ai, \ + dim_t i, \ + void* x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( setijv ) @@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( setijv ) BLIS_EXPORT_BLIS err_t bli_getijv ( - dim_t i, - obj_t* x, - double* ar, - double* ai + dim_t i, + const obj_t* x, + double* ar, + double* ai ); #undef GENTPROT @@ -68,10 +68,10 @@ BLIS_EXPORT_BLIS err_t bli_getijv \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - void* restrict b, inc_t incx, \ - double* ar, \ - double* ai \ + dim_t i, \ + const void* b, inc_t incx, \ + double* ar, \ + double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) diff --git a/frame/base/bli_setri.c b/frame/base/bli_setri.c index 7220571c0..15e698b2b 100644 --- a/frame/base/bli_setri.c +++ b/frame/base/bli_setri.c @@ -38,8 +38,8 @@ void bli_setrm ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + const 
obj_t* b ) { obj_t alpha_real; @@ -67,8 +67,8 @@ void bli_setrm void bli_setrv ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { obj_t alpha_real; @@ -98,8 +98,8 @@ void bli_setrv void bli_setim ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + const obj_t* b ) { obj_t alpha_real; @@ -130,8 +130,8 @@ void bli_setim void bli_setiv ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { obj_t alpha_real; diff --git a/frame/base/bli_setri.h b/frame/base/bli_setri.h index dd6ce9f3f..ff5a09681 100644 --- a/frame/base/bli_setri.h +++ b/frame/base/bli_setri.h @@ -36,27 +36,27 @@ BLIS_EXPORT_BLIS void bli_setrm ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + const obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + const obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c index 64db75d24..57dd48bbc 100644 --- a/frame/base/cast/bli_castm.c +++ b/frame/base/cast/bli_castm.c @@ -41,11 +41,11 @@ typedef void (*FUNCPTR_T) ( - trans_t transa, - dim_t m, - dim_t n, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b + trans_t transa, + dim_t m, + dim_t n, + const void* a, inc_t rs_a, inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castm); @@ -56,27 +56,25 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castm); void bli_castm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { - num_t dt_a = bli_obj_dt( a ); - num_t dt_b = bli_obj_dt( b ); + const num_t dt_a = bli_obj_dt( a ); + const num_t dt_b = bli_obj_dt( b ); - trans_t transa = bli_obj_conjtrans_status( a ); + const trans_t transa = 
bli_obj_conjtrans_status( a ); - dim_t m = bli_obj_length( b ); - dim_t n = bli_obj_width( b ); + const dim_t m = bli_obj_length( b ); + const dim_t n = bli_obj_width( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a = bli_obj_row_stride( a ); - inc_t cs_a = bli_obj_col_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t cs_b = bli_obj_col_stride( b ); - - FUNCPTR_T f; + void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -93,10 +91,7 @@ void bli_castm // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_a][dt_b]; - - // Invoke the void pointer-based function. - f + ftypes[dt_a][dt_b] ( transa, m, @@ -117,21 +112,21 @@ void bli_castm \ void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ) \ { \ - ctype_a* restrict a_cast = a; \ - ctype_b* restrict b_cast = b; \ - conj_t conja; \ - dim_t n_iter; \ - dim_t n_elem; \ - inc_t lda, inca; \ - inc_t ldb, incb; \ - dim_t j, i; \ + const ctype_a* restrict a_cast = a; \ + ctype_b* restrict b_cast = b; \ + conj_t conja; \ + dim_t n_iter; \ + dim_t n_elem; \ + inc_t lda, inca; \ + inc_t ldb, incb; \ + dim_t j, i; \ \ /* Set various loop parameters. 
*/ \ bli_set_dims_incs_2m \ @@ -150,8 +145,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -163,8 +158,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -182,8 +177,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -195,8 +190,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -221,8 +216,8 @@ INSERT_GENTFUNC2_MIXDP0( castm ) void bli_castm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { err_t e_val; diff --git a/frame/base/cast/bli_castm.h b/frame/base/cast/bli_castm.h index e9e1dee21..c06d1241a 100644 --- a/frame/base/cast/bli_castm.h +++ b/frame/base/cast/bli_castm.h @@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void bli_castm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); // @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void 
bli_castm \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) @@ -67,7 +67,7 @@ INSERT_GENTPROT2_MIXDP0( castm ) void bli_castm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c index a50bdfc15..071233169 100644 --- a/frame/base/cast/bli_castnzm.c +++ b/frame/base/cast/bli_castnzm.c @@ -41,11 +41,11 @@ typedef void (*FUNCPTR_T) ( - trans_t transa, - dim_t m, - dim_t n, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b + trans_t transa, + dim_t m, + dim_t n, + const void* a, inc_t rs_a, inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm); @@ -56,27 +56,25 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm); void bli_castnzm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { - num_t dt_a = bli_obj_dt( a ); - num_t dt_b = bli_obj_dt( b ); + const num_t dt_a = bli_obj_dt( a ); + const num_t dt_b = bli_obj_dt( b ); - trans_t transa = bli_obj_conjtrans_status( a ); + const trans_t transa = bli_obj_conjtrans_status( a ); - dim_t m = bli_obj_length( b ); - dim_t n = bli_obj_width( b ); + const dim_t m = bli_obj_length( b ); + const dim_t n = bli_obj_width( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a = bli_obj_row_stride( a ); - inc_t cs_a = bli_obj_col_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t cs_b = bli_obj_col_stride( b ); - - FUNCPTR_T f; + void* buf_b = bli_obj_buffer_at_off( b ); + const 
inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -93,10 +91,7 @@ void bli_castnzm // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_a][dt_b]; - - // Invoke the void pointer-based function. - f + ftypes[dt_a][dt_b] ( transa, m, @@ -117,21 +112,21 @@ void bli_castnzm \ void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ) \ { \ - ctype_a* restrict a_cast = a; \ - ctype_b* restrict b_cast = b; \ - conj_t conja; \ - dim_t n_iter; \ - dim_t n_elem; \ - inc_t lda, inca; \ - inc_t ldb, incb; \ - dim_t j, i; \ + const ctype_a* restrict a_cast = a; \ + ctype_b* restrict b_cast = b; \ + conj_t conja; \ + dim_t n_iter; \ + dim_t n_elem; \ + inc_t lda, inca; \ + inc_t ldb, incb; \ + dim_t j, i; \ \ /* Set various loop parameters. 
*/ \ bli_set_dims_incs_2m \ @@ -150,8 +145,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -163,8 +158,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -182,8 +177,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -195,8 +190,8 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -221,8 +216,8 @@ INSERT_GENTFUNC2_MIXDP0( castnzm ) void bli_castnzm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { err_t e_val; diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h index 42cfef8c0..03860fe40 100644 --- a/frame/base/cast/bli_castnzm.h +++ b/frame/base/cast/bli_castnzm.h @@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void bli_castnzm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); // @@ -51,11 +51,11 @@ 
BLIS_EXPORT_BLIS void bli_castnzm \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) @@ -67,7 +67,7 @@ INSERT_GENTPROT2_MIXDP0( castnzm ) void bli_castnzm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c index 213c960d8..c46a2798c 100644 --- a/frame/base/cast/bli_castv.c +++ b/frame/base/cast/bli_castv.c @@ -41,10 +41,10 @@ typedef void (*FUNCPTR_T) ( - conj_t conjx, - dim_t n, - void* restrict x, inc_t inc_x, - void* restrict y, inc_t inc_y + conj_t conjx, + dim_t n, + const void* x, inc_t inc_x, + void* y, inc_t inc_y ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castv); @@ -55,24 +55,22 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castv); void bli_castv ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { - num_t dt_x = bli_obj_dt( x ); - num_t dt_y = bli_obj_dt( y ); + const num_t dt_x = bli_obj_dt( x ); + const num_t dt_y = bli_obj_dt( y ); - conj_t conjx = bli_obj_conj_status( x ); + const conj_t conjx = bli_obj_conj_status( x ); - dim_t n = bli_obj_vector_dim( x ); + const dim_t n = bli_obj_vector_dim( x ); - void* buf_x = bli_obj_buffer_at_off( x ); - inc_t inc_x = bli_obj_vector_inc( x ); + const void* buf_x = bli_obj_buffer_at_off( x ); + const inc_t inc_x = bli_obj_vector_inc( x ); - void* buf_y = bli_obj_buffer_at_off( y ); - inc_t inc_y = bli_obj_vector_inc( y ); - - FUNCPTR_T f; + void* buf_y = bli_obj_buffer_at_off( y ); + const inc_t inc_y = bli_obj_vector_inc( y ); // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -89,10 +87,7 @@ void bli_castv // Index into the type combination array to extract the correct // function pointer. 
- f = ftypes[dt_x][dt_y]; - - // Invoke the void pointer-based function. - f + ftypes[dt_x][dt_y] ( conjx, n, @@ -112,15 +107,15 @@ void bli_castv \ void PASTEMAC2(chx,chy,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - void* restrict x, inc_t incx, \ - void* restrict y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const void* x, inc_t incx, \ + void* y, inc_t incy \ ) \ { \ - ctype_x* restrict x1 = x; \ - ctype_y* restrict y1 = y; \ - dim_t i; \ + const ctype_x* restrict x1 = x; \ + ctype_y* restrict y1 = y; \ + dim_t i; \ \ if ( bli_is_conj( conjx ) ) \ { \ @@ -175,8 +170,8 @@ INSERT_GENTFUNC2_MIXDP0( castv ) void bli_castv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; diff --git a/frame/base/cast/bli_castv.h b/frame/base/cast/bli_castv.h index 9a8261514..85d87d911 100644 --- a/frame/base/cast/bli_castv.h +++ b/frame/base/cast/bli_castv.h @@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void bli_castv ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); // @@ -51,10 +51,10 @@ BLIS_EXPORT_BLIS void bli_castv \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const void* x, inc_t incx, \ + void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) @@ -66,7 +66,7 @@ INSERT_GENTPROT2_MIXDP0( castv ) void bli_castv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); diff --git a/frame/base/check/bli_obj_check.c b/frame/base/check/bli_obj_check.c index a971fa19a..cbacdd0fc 100644 --- a/frame/base/check/bli_obj_check.c +++ b/frame/base/check/bli_obj_check.c @@ -34,12 +34,12 @@ #include "blis.h" -void bli_obj_create_check( num_t dt, - dim_t m, - dim_t n, - inc_t rs, - inc_t cs, - obj_t* obj ) +void bli_obj_create_check( num_t dt, + dim_t m, + dim_t n, + inc_t rs, + inc_t cs, + const obj_t* obj ) { err_t e_val; @@ -53,10 +53,10 @@ void bli_obj_create_check( num_t dt, bli_check_error_code( e_val ); } -void 
bli_obj_create_without_buffer_check( num_t dt, - dim_t m, - dim_t n, - obj_t* obj ) +void bli_obj_create_without_buffer_check( num_t dt, + dim_t m, + dim_t n, + const obj_t* obj ) { err_t e_val; @@ -67,10 +67,10 @@ void bli_obj_create_without_buffer_check( num_t dt, bli_check_error_code( e_val ); } -void bli_obj_alloc_buffer_check( inc_t rs, - inc_t cs, - inc_t is, - obj_t* obj ) +void bli_obj_alloc_buffer_check( inc_t rs, + inc_t cs, + inc_t is, + const obj_t* obj ) { err_t e_val; @@ -83,11 +83,11 @@ void bli_obj_alloc_buffer_check( inc_t rs, bli_check_error_code( e_val ); } -void bli_obj_attach_buffer_check( void* p, - inc_t rs, - inc_t cs, - inc_t is, - obj_t* obj ) +void bli_obj_attach_buffer_check( const void* p, + inc_t rs, + inc_t cs, + inc_t is, + const obj_t* obj ) { err_t e_val; @@ -109,8 +109,7 @@ void bli_obj_attach_buffer_check( void* p, bli_check_error_code( e_val ); } -void bli_obj_create_scalar_check( num_t dt, - obj_t* obj ) +void bli_obj_create_scalar_check( num_t dt, const obj_t* obj ) { err_t e_val; @@ -121,7 +120,7 @@ void bli_obj_create_scalar_check( num_t dt, bli_check_error_code( e_val ); } -void bli_obj_free_check( obj_t* obj ) +void bli_obj_free_check( const obj_t* obj ) { //err_t e_val; @@ -131,7 +130,7 @@ void bli_obj_free_check( obj_t* obj ) //bli_check_error_code( e_val ); } -void bli_obj_create_const_check( double value, obj_t* obj ) +void bli_obj_create_const_check( double value, const obj_t* obj ) { err_t e_val; @@ -185,7 +184,7 @@ void bli_dt_union_check( num_t dt1, num_t dt2 ) bli_check_error_code( e_val ); } -void bli_obj_print_check( char* label, obj_t* obj ) +void bli_obj_print_check( const char* label, const obj_t* obj ) { err_t e_val; diff --git a/frame/base/check/bli_obj_check.h b/frame/base/check/bli_obj_check.h index 201842844..8572f0cfb 100644 --- a/frame/base/check/bli_obj_check.h +++ b/frame/base/check/bli_obj_check.h @@ -32,37 +32,36 @@ */ -void bli_obj_create_check( num_t dt, - dim_t m, - dim_t n, - inc_t rs, - inc_t 
cs, - obj_t* obj ); - -void bli_obj_create_without_buffer_check( num_t dt, - dim_t m, - dim_t n, - obj_t* obj ); - -void bli_obj_alloc_buffer_check( inc_t rs, +void bli_obj_create_check( num_t dt, + dim_t m, + dim_t n, + inc_t rs, inc_t cs, - inc_t is, - obj_t* obj ); + const obj_t* obj ); -void bli_obj_attach_buffer_check( void* p, - inc_t rs, - inc_t cs, - inc_t is, - obj_t* obj ); +void bli_obj_create_without_buffer_check( num_t dt, + dim_t m, + dim_t n, + const obj_t* obj ); -void bli_obj_create_scalar_check( num_t dt, - obj_t* obj ); +void bli_obj_alloc_buffer_check( inc_t rs, + inc_t cs, + inc_t is, + const obj_t* obj ); -void bli_obj_free_check( obj_t* obj ); +void bli_obj_attach_buffer_check( const void* p, + inc_t rs, + inc_t cs, + inc_t is, + const obj_t* obj ); -void bli_obj_create_const_check( double value, obj_t* obj ); +void bli_obj_create_scalar_check( num_t dt, const obj_t* obj ); -void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); +void bli_obj_free_check( const obj_t* obj ); + +void bli_obj_create_const_check( double value, const obj_t* obj ); + +void bli_obj_create_const_copy_of_check( const obj_t* a, const obj_t* b ); void bli_dt_size_check( num_t dt ); @@ -70,5 +69,5 @@ void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); -void bli_obj_print_check( char* label, obj_t* obj ); +void bli_obj_print_check( const char* label, const obj_t* obj ); diff --git a/frame/base/check/bli_part_check.c b/frame/base/check/bli_part_check.c index 6d9aa37b9..d13a8c22f 100644 --- a/frame/base/check/bli_part_check.c +++ b/frame/base/check/bli_part_check.c @@ -34,11 +34,11 @@ #include "blis.h" -void bli_acquire_mpart_t2b_check( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_t2b_check( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; @@ -52,11 +52,11 @@ void bli_acquire_mpart_t2b_check( subpart_t 
requested_part, bli_check_error_code( e_val ); } -void bli_acquire_mpart_l2r_check( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_l2r_check( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; @@ -70,11 +70,11 @@ void bli_acquire_mpart_l2r_check( subpart_t requested_part, bli_check_error_code( e_val ); } -void bli_acquire_mpart_tl2br_check( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_tl2br_check( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; diff --git a/frame/base/check/bli_part_check.h b/frame/base/check/bli_part_check.h index 2905af0e4..810c5a3a7 100644 --- a/frame/base/check/bli_part_check.h +++ b/frame/base/check/bli_part_check.h @@ -32,21 +32,21 @@ */ -void bli_acquire_mpart_t2b_check( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_acquire_mpart_l2r_check( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_acquire_mpart_tl2br_check( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +void bli_acquire_mpart_t2b_check( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); + +void bli_acquire_mpart_l2r_check( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); + +void bli_acquire_mpart_tl2br_check( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); diff --git a/frame/base/proj/bli_projm.c b/frame/base/proj/bli_projm.c index 949bc2cc9..c79897083 100644 --- a/frame/base/proj/bli_projm.c +++ b/frame/base/proj/bli_projm.c @@ -36,8 +36,8 @@ void bli_projm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { // Check parameters. 
@@ -88,8 +88,8 @@ void bli_projm void bli_projm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { err_t e_val; diff --git a/frame/base/proj/bli_projm.h b/frame/base/proj/bli_projm.h index e95f7f2f5..924924f9b 100644 --- a/frame/base/proj/bli_projm.h +++ b/frame/base/proj/bli_projm.h @@ -34,13 +34,13 @@ BLIS_EXPORT_BLIS void bli_projm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); void bli_projm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); diff --git a/frame/base/proj/bli_projv.c b/frame/base/proj/bli_projv.c index 9a6587e5b..588ac39c3 100644 --- a/frame/base/proj/bli_projv.c +++ b/frame/base/proj/bli_projv.c @@ -36,8 +36,8 @@ void bli_projv ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { // Check parameters. @@ -88,8 +88,8 @@ void bli_projv void bli_projv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; diff --git a/frame/base/proj/bli_projv.h b/frame/base/proj/bli_projv.h index b738b2f97..abdf35522 100644 --- a/frame/base/proj/bli_projv.h +++ b/frame/base/proj/bli_projv.h @@ -34,13 +34,13 @@ BLIS_EXPORT_BLIS void bli_projv ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_projv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c index 4533375f0..31f677db6 100644 --- a/frame/compat/extra/bla_gemm3m.c +++ b/frame/compat/extra/bla_gemm3m.c @@ -103,7 +103,7 @@ void PASTEF77(ch,blasname) \ abbreviated version of bli_gemm_ex() so that we can bypass consideration of sup, which doesn't make sense in this context. 
*/ \ { \ - cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \ + cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \ \ rntm_t rntm_l; \ rntm_t* rntm = &rntm_l; \ @@ -222,7 +222,7 @@ void PASTEF77(ch,blasname) \ abbreviated version of bli_gemm_ex() so that we can bypass consideration of sup, which doesn't make sense in this context. */ \ { \ - cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \ + cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \ \ rntm_t rntm_l; \ rntm_t* rntm = &rntm_l; \ diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 9773e5e69..42ad9c72b 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -35,13 +35,13 @@ #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H -BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; -BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; +BLIS_EXPORT_BLIS extern const obj_t BLIS_TWO; +BLIS_EXPORT_BLIS extern const obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; -BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; +BLIS_EXPORT_BLIS extern const obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; -BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; -BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; +BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE; +BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; diff --git a/frame/include/bli_oapi_ba.h b/frame/include/bli_oapi_ba.h index dc17507d1..d80263597 100644 --- a/frame/include/bli_oapi_ba.h +++ b/frame/include/bli_oapi_ba.h @@ -54,6 +54,6 @@ // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. 
#undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; +#define BLIS_OAPI_EX_DECLS const cntx_t* cntx = NULL; ( void )cntx; \ + rntm_t* rntm = NULL; ( void )rntm; diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index 0eb5eb2a1..7252fd7ff 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm +#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index fe174202c..9adaef211 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -42,363 +42,363 @@ // Info query -BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt( const obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } -BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_float( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } -BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_double( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } -BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_scomplex( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } -BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_dcomplex( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } -BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_int( const 
obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } -BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_const( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } -BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_domain( const obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } -BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_prec( const obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } -BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_single_prec( const obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } -BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_double_prec( const obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( const obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( const obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } -BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_real( const obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } -BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_complex( const obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt_proj_to_real( const obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj 
) +BLIS_INLINE num_t bli_obj_dt_proj_to_complex( const obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } -BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_target_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } -BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_target_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } -BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_target_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } -BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_exec_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } -BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_exec_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } -BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_exec_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } -BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_comp_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } -BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_comp_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } -BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_comp_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. 
-BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_scalar_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. -BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_scalar_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. -BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_scalar_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } -BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) +BLIS_INLINE trans_t bli_obj_conjtrans_status( const obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } -BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) +BLIS_INLINE trans_t bli_obj_onlytrans_status( const obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } -BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_trans( const obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } -BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_notrans( const obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } -BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) +BLIS_INLINE conj_t bli_obj_conj_status( const obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } -BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_conj( const obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } -BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_noconj( const obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } -BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) 
+BLIS_INLINE uplo_t bli_obj_uplo( const obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } -BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_upper( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } -BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_lower( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } -BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_upper_or_lower( const obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } -BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_dense( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_DENSE ); } -BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_zeros( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } -BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) +BLIS_INLINE diag_t bli_obj_diag( const obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } -BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_nonunit_diag( const obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } -BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_unit_diag( const obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } -BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_inverted_diag( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } -BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } -BLIS_INLINE bool 
bli_obj_is_pack_rev_if_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } -BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) +BLIS_INLINE pack_t bli_obj_pack_schema( const obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } -BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_packed( const obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } -BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_row_packed( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_ROWS ) ); + BLIS_BITVAL_PACKED_ROWS ) ); } -BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_col_packed( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_COLUMNS ) ); + BLIS_BITVAL_PACKED_COLUMNS ) ); } -BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_panel_packed( const obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } -BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) +BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( const obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } -BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) +BLIS_INLINE struc_t bli_obj_struc( const obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } -BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_general( const obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } -BLIS_INLINE bool bli_obj_is_hermitian( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_hermitian( const obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } -BLIS_INLINE bool 
bli_obj_is_symmetric( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_symmetric( const obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } -BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_triangular( const obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); @@ -599,49 +599,49 @@ BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) // Root matrix query -BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) +BLIS_INLINE obj_t* bli_obj_root( const obj_t* obj ) { return ( obj_t* )( obj->root ); } -BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_general( const obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_hermitian( const obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_symmetric( const obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_triangular( const obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_herm_or_symm( const obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_upper( const obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_lower( const obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); @@ -656,13 +656,13 @@ BLIS_INLINE void bli_obj_set_as_root( 
obj_t* obj ) // Diagonal offset query -BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) +BLIS_INLINE doff_t bli_obj_diag_offset( const obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } -BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) +BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( const obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) @@ -688,46 +688,46 @@ BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) // Dimension query -BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length( const obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } -BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width( const obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } -BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) +BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, const obj_t* obj ) { return ( obj->dim[ mdim ] ); } -BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_min_dim( const obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } -BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_max_dim( const obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } -BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width( obj ) : bli_obj_length( obj ) ); } -BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length( obj ) : bli_obj_width( obj ) ); } -BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) +BLIS_INLINE bool bli_obj_is_1x1( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && @@ -736,34 +736,34 @@ BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) // Stride/increment query -BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_row_stride( const obj_t* obj ) { return ( obj->rs ); } -BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_col_stride( const obj_t* obj ) { return ( obj->cs ); } -BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_imag_stride( const obj_t* obj ) { return ( obj->is ); } -BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_row_stride_mag( const obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } -BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_col_stride_mag( const obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } -BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_imag_stride_mag( const obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); @@ -773,7 +773,7 @@ BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. 
-BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length_stored( const obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) @@ -784,7 +784,7 @@ BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) ); } -BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width_stored( const obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) @@ -795,25 +795,25 @@ BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) ); } -BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length_stored_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } -BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width_stored_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } -BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) +BLIS_INLINE dim_t bli_obj_vector_dim( const obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } -BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) +BLIS_INLINE inc_t bli_obj_vector_inc( const obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? 
bli_obj_col_stride( x ) @@ -821,26 +821,26 @@ BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) ); } -BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) +BLIS_INLINE bool bli_obj_is_vector( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } -BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) +BLIS_INLINE bool bli_obj_is_row_vector( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } -BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) +BLIS_INLINE bool bli_obj_is_col_vector( const obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } -BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) +BLIS_INLINE bool bli_obj_has_zero_dim( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || @@ -894,32 +894,32 @@ BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, o // "obj" macros are used on packed matrices. // -BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_row_stored( const obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } -BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_col_stored( const obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } -BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_gen_stored( const obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } -BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_row_tilted( const obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } -BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_col_tilted( const obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); @@ -950,17 +950,17 @@ BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) // Offset query -BLIS_INLINE dim_t 
bli_obj_row_off( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_row_off( const obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } -BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_col_off( const obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } -BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) +BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, const obj_t* obj ) { return ( obj->off[ mdim ] ); } @@ -991,33 +991,33 @@ BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) // Diagonal offset predicates -BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_strictly_above_diag( const obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } -BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_strictly_below_diag( const obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } -BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_outside_diag( const obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } -BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_intersects_diag( const obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } -BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_unstored_subpart( const obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || @@ -1026,7 +1026,7 @@ BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) // Buffer address query -BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) +BLIS_INLINE void* bli_obj_buffer( const obj_t* obj ) { return ( void* ) ( obj->buffer ); @@ -1041,7 +1041,7 @@ BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) // Bufferless scalar field 
query -BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) +BLIS_INLINE void* bli_obj_internal_scalar_buffer( const obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); @@ -1049,14 +1049,14 @@ BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) // Bufferless scalar field modification -BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_copy_internal_scalar( const obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query -BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) +BLIS_INLINE siz_t bli_obj_elem_size( const obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); @@ -1071,12 +1071,12 @@ BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) // Packed matrix info query -BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_padded_length( const obj_t* obj ) { return ( obj->m_padded ); } -BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_padded_width( const obj_t* obj ) { return ( obj->n_padded ); } @@ -1101,22 +1101,22 @@ BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) // Packed panel info query -BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_panel_length( const obj_t* obj ) { return ( obj->m_panel ); } -BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_panel_width( const obj_t* obj ) { return ( obj->n_panel ); } -BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_panel_dim( const obj_t* obj ) { return ( obj->pd ); } -BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_panel_stride( const obj_t* obj ) { return ( obj->ps ); } @@ -1151,7 +1151,7 @@ BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) // stor3_t-related -BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) +BLIS_INLINE stor3_t bli_obj_stor3_from_strides( const obj_t* c, const obj_t* 
a, const obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); @@ -1191,22 +1191,22 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) // Function pointer query -BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) +BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( const obj_t* obj ) { return obj->pack_fn; } -BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) +BLIS_INLINE void* bli_obj_pack_params( const obj_t* obj ) { return obj->pack_params; } -BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) +BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( const obj_t* obj ) { return obj->ker_fn; } -BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) +BLIS_INLINE void* bli_obj_ker_params( const obj_t* obj ) { return obj->ker_params; } @@ -1261,7 +1261,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); - void* restrict s = bli_obj_internal_scalar_buffer( obj ); + void* s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } @@ -1315,7 +1315,7 @@ BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) // Acquire buffer at object's submatrix offset (offset-aware buffer query). -BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) +BLIS_INLINE void* bli_obj_buffer_at_off( const obj_t* obj ) { return ( void* ) ( @@ -1330,7 +1330,7 @@ BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) // Acquire buffer from BLIS_CONSTANT object. -BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) +BLIS_INLINE const void* bli_obj_buffer_for_const( num_t dt, const obj_t* obj ) { void* p; @@ -1345,7 +1345,7 @@ BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. 
-BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) +BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, const obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) @@ -1360,21 +1360,21 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { - bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); - bli_obj_set_offs( 0, 0, obj ); + bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); + bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). -BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_alias_to( const obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. -BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) +BLIS_INLINE bool bli_obj_is_alias_of( const obj_t* a, const obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); @@ -1384,7 +1384,7 @@ BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) // Create an alias with a trans value applied. // (Note: trans may include a conj component.) -BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, const obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); @@ -1392,7 +1392,7 @@ BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) // Create an alias with a conj value applied. -BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, const obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); @@ -1400,7 +1400,7 @@ BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) // Alias only the real part. 
-BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) +BLIS_INLINE void bli_obj_real_part( const obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); @@ -1433,7 +1433,7 @@ BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) // Alias only the imaginary part. -BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) +BLIS_INLINE void bli_obj_imag_part( const obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { @@ -1472,7 +1472,7 @@ BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). -BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) +BLIS_INLINE void bli_obj_scalar_set_dt_buffer( const obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { diff --git a/frame/include/bli_tapi_ba.h b/frame/include/bli_tapi_ba.h index 0177985d9..6a7e195ab 100644 --- a/frame/include/bli_tapi_ba.h +++ b/frame/include/bli_tapi_ba.h @@ -54,6 +54,6 @@ // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; +#define BLIS_TAPI_EX_DECLS const cntx_t* cntx = NULL; ( void )cntx; \ + rntm_t* rntm = NULL; ( void )rntm; diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index c999b0ae9..f12be24b8 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm +#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 4e64f3711..e957fc6b2 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1111,26 +1111,26 @@ typedef struct // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. - void* a_next; - void* b_next; + const void* a_next; + const void* b_next; // The imaginary strides of A and B. - inc_t is_a; - inc_t is_b; + inc_t is_a; + inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). - inc_t ps_a; - inc_t ps_b; + inc_t ps_a; + inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; - void* params; + const void* params; } auxinfo_t; @@ -1162,23 +1162,23 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( - struct obj_s* a, - struct obj_s* ap, - struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - struct thrinfo_s* thread + const struct obj_s* a, + struct obj_s* ap, + const struct cntx_s* cntx, + struct rntm_s* rntm, + struct cntl_s* cntl, + const struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( - struct obj_s* a, - struct obj_s* b, - struct obj_s* c, - struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - struct thrinfo_s* thread + const struct obj_s* a, + const struct obj_s* b, + const struct obj_s* c, + const struct cntx_s* cntx, + struct rntm_s* rntm, + struct cntl_s* cntl, + const struct thrinfo_s* thread ); typedef struct obj_s @@ -1297,7 +1297,7 @@ typedef struct obj_s // Define these macros here since they must be updated if contents of // obj_t changes. 
-BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_init_full_shallow_copy_of( const obj_t* a, obj_t* b ) { b->root = a->root; @@ -1332,7 +1332,7 @@ BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) b->ker_params = a->ker_params; } -BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b ) { b->root = a->root; diff --git a/frame/thread/bli_l3_decor.h b/frame/thread/bli_l3_decor.h index 0b09189a6..e2208aae6 100644 --- a/frame/thread/bli_l3_decor.h +++ b/frame/thread/bli_l3_decor.h @@ -41,30 +41,30 @@ // Level-3 internal function type. typedef void (*l3int_t) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); // Level-3 thread decorator prototype. 
void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index 5b40d0614..2c71c7532 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -46,29 +46,18 @@ void* bli_l3_thread_entry( void* data_void ) { return NULL; } void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); - // Query the total number of threads from the rntm_t object. const dim_t n_threads = bli_rntm_num_threads( rntm ); @@ -83,7 +72,7 @@ void bli_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. 
bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -96,7 +85,7 @@ void bli_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) @@ -104,8 +93,8 @@ void bli_l3_thread_decorator // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); @@ -119,7 +108,6 @@ void bli_l3_thread_decorator // be allocated/initialized. bli_sba_rntm_set_pool( tid, array, rntm_p ); - obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; @@ -133,6 +121,17 @@ void bli_l3_thread_decorator bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); + // This is part of a hack to support mixed domain in bli_gemm_front(). + // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. 
+ pack_t schema_a = bli_obj_pack_schema( &a_t ); + pack_t schema_b = bli_obj_pack_schema( &b_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); + // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); diff --git a/frame/thread/bli_l3_decor_openmp.h b/frame/thread/bli_l3_decor_openmp.h index 80dbe5374..6ff7f16a9 100644 --- a/frame/thread/bli_l3_decor_openmp.h +++ b/frame/thread/bli_l3_decor_openmp.h @@ -43,7 +43,7 @@ void bli_l3_thread_decorator_thread_check ( dim_t n_threads, dim_t tid, - thrcomm_t* gl_comm, + thrcomm_t* gl_comm, rntm_t* rntm ); diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index 89b6ea118..80247dfb1 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -40,49 +40,45 @@ // A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { - l3int_t func; - opid_t family; - pack_t schema_a; - pack_t schema_b; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - cntx_t* cntx; - rntm_t* rntm; - cntl_t* cntl; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; + l3int_t func; + opid_t family; + const obj_t* alpha; + const obj_t* a; + const obj_t* b; + const obj_t* beta; + const obj_t* c; + const cntx_t* cntx; + rntm_t* rntm; + cntl_t* cntl; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; } thread_data_t; // Entry point for additional threads void* bli_l3_thread_entry( void* data_void ) { - thread_data_t* data = data_void; - - l3int_t func = data->func; - opid_t family = data->family; - pack_t schema_a = data->schema_a; - pack_t schema_b = data->schema_b; - obj_t* alpha = data->alpha; - obj_t* a = data->a; - obj_t* b = data->b; - obj_t* beta = data->beta; - obj_t* c = data->c; - cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - cntl_t* cntl = data->cntl; - dim_t tid = data->tid; - array_t* array = data->array; - thrcomm_t* gl_comm = data->gl_comm; + const thread_data_t* data = data_void; + + const l3int_t func = data->func; + const opid_t family = data->family; + const obj_t* alpha = data->alpha; + const obj_t* a = data->a; + const obj_t* b = data->b; + const obj_t* beta = data->beta; + const obj_t* c = data->c; + const cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + cntl_t* cntl = data->cntl; + const dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. 
@@ -90,9 +86,9 @@ void* bli_l3_thread_entry( void* data_void ) // be allocated/initialized. bli_sba_rntm_set_pool( tid, array, rntm_p ); - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; + obj_t a_t, b_t, c_t; + cntl_t* cntl_use; + thrinfo_t* thread; // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. Making thread-local @@ -103,6 +99,17 @@ void* bli_l3_thread_entry( void* data_void ) bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); + // This is part of a hack to support mixed domain in bli_gemm_front(). + // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. + pack_t schema_a = bli_obj_pack_schema( &a_t ); + pack_t schema_b = bli_obj_pack_schema( &b_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); + // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); @@ -134,31 +141,20 @@ void* bli_l3_thread_entry( void* data_void ) void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { err_t r_val; - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. 
Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); - // Query the total number of threads from the context. const dim_t n_threads = bli_rntm_num_threads( rntm ); @@ -168,7 +164,7 @@ void bli_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -181,7 +177,7 @@ void bli_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. @@ -203,8 +199,6 @@ void bli_l3_thread_decorator // Set up thread data for additional threads (beyond thread 0). 
datas[tid].func = func; datas[tid].family = family; - datas[tid].schema_a = schema_a; - datas[tid].schema_b = schema_b; datas[tid].alpha = alpha; datas[tid].a = a; datas[tid].b = b; diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c index 51474f0ee..c2c43b370 100644 --- a/frame/thread/bli_l3_decor_single.c +++ b/frame/thread/bli_l3_decor_single.c @@ -39,28 +39,32 @@ void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { + obj_t a_t, b_t; + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + // This is part of a hack to support mixed domain in bli_gemm_front(). // Sometimes we need to specify a non-standard schema for A and B, and // we decided to transmit them via the schema field in the obj_t's // rather than pass them in as function parameters. Once the values // have been read, we immediately reset them back to their expected // values for unpacked objects. - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); + pack_t schema_a = bli_obj_pack_schema( &a_t ); + pack_t schema_b = bli_obj_pack_schema( &b_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); // For sequential execution, we use only one thread. const dim_t n_threads = 1; @@ -71,7 +75,7 @@ void bli_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. 
- array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we can create the global comm below. @@ -81,13 +85,13 @@ void bli_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); { // NOTE: We don't need to create another copy of the rntm_t since // it was already copied in one of the high-level oapi functions. - rntm_t* restrict rntm_p = rntm; + rntm_t* rntm_p = rntm; cntl_t* cntl_use; thrinfo_t* thread; @@ -111,7 +115,7 @@ void bli_l3_thread_decorator // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, - a, b, c, rntm_p, cntl, &cntl_use ); + &a_t, &b_t, c, rntm_p, cntl, &cntl_use ); // Create the root node of the thread's thrinfo_t structure. bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); @@ -119,8 +123,8 @@ void bli_l3_thread_decorator func ( alpha, - a, - b, + &a_t, + &b_t, beta, c, cntx, diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h index a001e5b74..6e0401151 100644 --- a/frame/thread/bli_l3_sup_decor.h +++ b/frame/thread/bli_l3_sup_decor.h @@ -41,28 +41,28 @@ // Level-3 sup internal function type. typedef err_t (*l3supint_t) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ); // Level-3 sup thread decorator prototype. 
err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index 1db9514fd..ff6bc667d 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -46,15 +46,15 @@ void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // Query the total number of threads from the rntm_t object. @@ -66,7 +66,7 @@ err_t bli_l3_sup_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -79,7 +79,7 @@ err_t bli_l3_sup_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. 
- thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) @@ -87,8 +87,8 @@ err_t bli_l3_sup_thread_decorator // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index dade71a03..375a85730 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -40,18 +40,18 @@ // A data structure to assist in passing operands to additional threads. typedef struct thread_data { - l3supint_t func; - opid_t family; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - cntx_t* cntx; - rntm_t* rntm; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; + l3supint_t func; + opid_t family; + const obj_t* alpha; + const obj_t* a; + const obj_t* b; + const obj_t* beta; + const obj_t* c; + const cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; } thread_data_t; // Entry point for additional threads @@ -59,26 +59,26 @@ void* bli_l3_sup_thread_entry( void* data_void ) { thread_data_t* data = data_void; - l3supint_t func = data->func; - opid_t family = data->family; - obj_t* alpha = data->alpha; - obj_t* a = data->a; - obj_t* b = data->b; - obj_t* beta = data->beta; - obj_t* c = data->c; - cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - dim_t tid = data->tid; - array_t* array = data->array; - thrcomm_t* gl_comm = data->gl_comm; + l3supint_t func = data->func; + opid_t family = data->family; + const obj_t* alpha = data->alpha; + const obj_t* 
a = data->a; + const obj_t* b = data->b; + const obj_t* beta = data->beta; + const obj_t* c = data->c; + const cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; ( void )family; // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. @@ -111,15 +111,15 @@ void* bli_l3_sup_thread_entry( void* data_void ) err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { err_t r_val; @@ -133,7 +133,7 @@ err_t bli_l3_sup_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -146,7 +146,7 @@ err_t bli_l3_sup_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. 
- thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c index a87af4103..df767ad29 100644 --- a/frame/thread/bli_l3_sup_decor_single.c +++ b/frame/thread/bli_l3_sup_decor_single.c @@ -41,17 +41,15 @@ err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - //pack_t schema_a, - //pack_t schema_b, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // For sequential execution, we use only one thread. @@ -63,7 +61,7 @@ err_t bli_l3_sup_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. bli_sba_rntm_set_pool( 0, array, rntm ); @@ -73,14 +71,14 @@ err_t bli_l3_sup_thread_decorator #ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); #endif { // NOTE: We don't need to create another copy of the rntm_t since // it was already copied in one of the high-level oapi functions. - rntm_t* restrict rntm_p = rntm; + rntm_t* rntm_p = rntm; // There is only one thread id (for the thief thread). 
const dim_t tid = 0; @@ -138,7 +136,6 @@ err_t bli_l3_sup_thread_decorator bli_sba_checkin_array( array ); return BLIS_SUCCESS; - } #endif diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 6dc4f9141..7d647a314 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -67,12 +67,12 @@ void bli_thread_finalize( void ) void bli_thread_range_sub ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end ) { dim_t n_way = bli_thread_n_way( thread ); @@ -211,11 +211,11 @@ void bli_thread_range_sub siz_t bli_thread_range_l2r ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -231,11 +231,11 @@ siz_t bli_thread_range_l2r siz_t bli_thread_range_r2l ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -251,11 +251,11 @@ siz_t bli_thread_range_r2l siz_t bli_thread_range_t2b ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -271,11 +271,11 @@ siz_t bli_thread_range_t2b siz_t bli_thread_range_b2t ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -504,15 +504,15 @@ siz_t bli_find_area_trap_l siz_t bli_thread_range_weighted_sub ( - thrinfo_t* restrict thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* restrict j_start_thr, - 
dim_t* restrict j_end_thr + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); @@ -641,15 +641,15 @@ siz_t bli_thread_range_weighted_sub siz_t bli_thread_range_mdim ( - dir_t direct, - thrinfo_t* thr, - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl, - cntx_t* cntx, - dim_t* start, - dim_t* end + dir_t direct, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, + dim_t* start, + dim_t* end ) { bszid_t bszid = bli_cntl_bszid( cntl ); @@ -665,8 +665,8 @@ siz_t bli_thread_range_mdim else bszid = BLIS_NR; } - blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - obj_t* x; + const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; bool use_weighted; // Use the operation family to choose the one of the two matrices @@ -700,15 +700,15 @@ siz_t bli_thread_range_mdim siz_t bli_thread_range_ndim ( - dir_t direct, - thrinfo_t* thr, - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl, - cntx_t* cntx, - dim_t* start, - dim_t* end + dir_t direct, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, + dim_t* start, + dim_t* end ) { bszid_t bszid = bli_cntl_bszid( cntl ); @@ -724,8 +724,8 @@ siz_t bli_thread_range_ndim else bszid = BLIS_NR; } - blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - obj_t* x; + const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; bool use_weighted; // Use the operation family to choose the one of the two matrices @@ -759,11 +759,11 @@ siz_t bli_thread_range_ndim siz_t bli_thread_range_weighted_l2r ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { siz_t area; @@ -809,11 +809,11 @@ siz_t 
bli_thread_range_weighted_l2r siz_t bli_thread_range_weighted_r2l ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { siz_t area; @@ -861,11 +861,11 @@ siz_t bli_thread_range_weighted_r2l siz_t bli_thread_range_weighted_t2b ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { siz_t area; @@ -913,11 +913,11 @@ siz_t bli_thread_range_weighted_t2b siz_t bli_thread_range_weighted_b2t ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, - dim_t* start, - dim_t* end + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end ) { siz_t area; @@ -1295,31 +1295,31 @@ void bli_thread_partition_2x2_orig dim_t tn1; // = *nt1; dim_t tn2; // = *nt2; - // Partition a number of threads into two factors nt1 and nt2 such that - // nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a - // slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|). + // Partition a number of threads into two factors nt1 and nt2 such that + // nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a + // slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|). - // Return early small prime numbers of threads. - if ( n_thread < 4 ) - { - tn1 = ( work1 >= work2 ? n_thread : 1 ); - tn2 = ( work1 < work2 ? n_thread : 1 ); + // Return early small prime numbers of threads. + if ( n_thread < 4 ) + { + tn1 = ( work1 >= work2 ? n_thread : 1 ); + tn2 = ( work1 < work2 ? n_thread : 1 ); return; - } + } - tn1 = 1; - tn2 = 1; + tn1 = 1; + tn2 = 1; - // Both algorithms need the prime factorization of n_thread. - bli_prime_factors_t factors; - bli_prime_factorization( n_thread, &factors ); + // Both algorithms need the prime factorization of n_thread. 
+ bli_prime_factors_t factors; + bli_prime_factorization( n_thread, &factors ); #if 1 - // Fast algorithm: assign prime factors in increasing order to whichever - // partition has more work to do. The work is divided by the number of - // threads assigned at each iteration. This algorithm is sub-optimal in + // Fast algorithm: assign prime factors in increasing order to whichever + // partition has more work to do. The work is divided by the number of + // threads assigned at each iteration. This algorithm is sub-optimal in // some cases. We attempt to mitigate the cases that involve at least one // factor of 2. For example, in the partitioning of 12 with equal work // this algorithm tentatively finds 6x2. This factorization involves a @@ -1330,22 +1330,22 @@ void bli_thread_partition_2x2_orig //printf( "w1 w2 = %d %d (initial)\n", (int)work1, (int)work2 ); - dim_t f; - while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) - { + dim_t f; + while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) + { //printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d ... f = %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2, (int)f ); - if ( work1 > work2 ) - { - work1 /= f; - tn1 *= f; - } - else - { - work2 /= f; - tn2 *= f; - } - } + if ( work1 > work2 ) + { + work1 /= f; + tn1 *= f; + } + else + { + work2 /= f; + tn2 *= f; + } + } //printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2 ); @@ -1391,78 +1391,78 @@ void bli_thread_partition_2x2_orig #else - // Slow algorithm: exhaustively constructs all factor pairs of n_thread and - // chooses the best one. - - // Eight prime factors handles n_thread up to 223092870. - dim_t fact[8]; - dim_t mult[8]; - - // There is always at least one prime factor, so use if for initialization. - dim_t nfact = 1; - fact[0] = bli_next_prime_factor( &factors ); - mult[0] = 1; - - // Collect the remaining prime factors, accounting for multiplicity of - // repeated factors. 
- dim_t f; - while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) - { - if ( f == fact[nfact-1] ) - { - mult[nfact-1]++; - } - else - { - nfact++; - fact[nfact-1] = f; - mult[nfact-1] = 1; - } - } - - // Now loop over all factor pairs. A single factor pair is denoted by how - // many of each prime factor are included in the first factor (ntaken). - dim_t ntake[8] = {0}; - dim_t min_diff = INT_MAX; - - // Loop over how many prime factors to assign to the first factor in the - // pair, for each prime factor. The total number of iterations is - // \Prod_{i=0}^{nfact-1} mult[i]. - bool done = FALSE; - while ( !done ) - { - dim_t x = 1; - dim_t y = 1; - - // Form the factors by integer exponentiation and accumulation. - for (dim_t i = 0 ; i < nfact ; i++ ) - { - x *= bli_ipow( fact[i], ntake[i] ); - y *= bli_ipow( fact[i], mult[i]-ntake[i] ); - } - - // Check if this factor pair is optimal by checking - // |nt1*work2 - nt2*work1|. - dim_t diff = llabs( x*work2 - y*work1 ); - if ( diff < min_diff ) - { - min_diff = diff; - tn1 = x; - tn2 = y; - } - - // Go to the next factor pair by doing an "odometer loop". - for ( dim_t i = 0 ; i < nfact ; i++ ) - { - if ( ++ntake[i] > mult[i] ) - { - ntake[i] = 0; - if ( i == nfact-1 ) done = TRUE; - else continue; - } - break; - } - } + // Slow algorithm: exhaustively constructs all factor pairs of n_thread and + // chooses the best one. + + // Eight prime factors handles n_thread up to 223092870. + dim_t fact[8]; + dim_t mult[8]; + + // There is always at least one prime factor, so use if for initialization. + dim_t nfact = 1; + fact[0] = bli_next_prime_factor( &factors ); + mult[0] = 1; + + // Collect the remaining prime factors, accounting for multiplicity of + // repeated factors. + dim_t f; + while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) + { + if ( f == fact[nfact-1] ) + { + mult[nfact-1]++; + } + else + { + nfact++; + fact[nfact-1] = f; + mult[nfact-1] = 1; + } + } + + // Now loop over all factor pairs. 
A single factor pair is denoted by how + // many of each prime factor are included in the first factor (ntaken). + dim_t ntake[8] = {0}; + dim_t min_diff = INT_MAX; + + // Loop over how many prime factors to assign to the first factor in the + // pair, for each prime factor. The total number of iterations is + // \Prod_{i=0}^{nfact-1} mult[i]. + bool done = FALSE; + while ( !done ) + { + dim_t x = 1; + dim_t y = 1; + + // Form the factors by integer exponentiation and accumulation. + for (dim_t i = 0 ; i < nfact ; i++ ) + { + x *= bli_ipow( fact[i], ntake[i] ); + y *= bli_ipow( fact[i], mult[i]-ntake[i] ); + } + + // Check if this factor pair is optimal by checking + // |nt1*work2 - nt2*work1|. + dim_t diff = llabs( x*work2 - y*work1 ); + if ( diff < min_diff ) + { + min_diff = diff; + tn1 = x; + tn2 = y; + } + + // Go to the next factor pair by doing an "odometer loop". + for ( dim_t i = 0 ; i < nfact ; i++ ) + { + if ( ++ntake[i] > mult[i] ) + { + ntake[i] = 0; + if ( i == nfact-1 ) done = TRUE; + else continue; + } + break; + } + } #endif diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index d4880c4c8..5e9c650b5 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -66,12 +66,12 @@ void bli_thread_finalize( void ); BLIS_EXPORT_BLIS void bli_thread_range_sub ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end ); #undef GENPROT @@ -79,15 +79,15 @@ void bli_thread_range_sub \ siz_t PASTEMAC0( opname ) \ ( \ - dir_t direct, \ - thrinfo_t* thr, \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntl_t* cntl, \ - cntx_t* cntx, \ - dim_t* start, \ - dim_t* end \ + dir_t direct, \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntl_t* cntl, \ + const cntx_t* cntx, \ + dim_t* start, \ + dim_t* end \ ); GENPROT( thread_range_mdim ) @@ -98,11 
+98,11 @@ GENPROT( thread_range_ndim ) \ siz_t PASTEMAC0( opname ) \ ( \ - thrinfo_t* thr, \ - obj_t* a, \ - blksz_t* bmult, \ - dim_t* start, \ - dim_t* end \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const blksz_t* bmult, \ + dim_t* start, \ + dim_t* end \ ); GENPROT( thread_range_l2r ) @@ -136,15 +136,15 @@ siz_t bli_find_area_trap_l ); siz_t bli_thread_range_weighted_sub ( - thrinfo_t* restrict thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* restrict j_start_thr, - dim_t* restrict j_end_thr + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr ); // ----------------------------------------------------------------------------- @@ -157,9 +157,9 @@ typedef struct dim_t f; } bli_prime_factors_t; -void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); +void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors ); -dim_t bli_next_prime_factor(bli_prime_factors_t* factors); +dim_t bli_next_prime_factor( bli_prime_factors_t* factors ); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 @@ -211,13 +211,13 @@ void bli_thread_init_rntm_from_env( rntm_t* rntm ); BLIS_INLINE void bli_thread_range_jrir_rr ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. @@ -228,13 +228,13 @@ BLIS_INLINE void bli_thread_range_jrir_rr BLIS_INLINE void bli_thread_range_jrir_sl ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. 
@@ -244,13 +244,13 @@ BLIS_INLINE void bli_thread_range_jrir_sl BLIS_INLINE void bli_thread_range_jrir ( - thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 0282be170..bbe711400 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -41,7 +41,7 @@ thrinfo_t* bli_thrinfo_create thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node @@ -51,13 +51,13 @@ thrinfo_t* bli_thrinfo_create printf( "bli_thrinfo_create(): " ); #endif - thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) ); + thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) ); - bli_thrinfo_init + bli_thrinfo_init ( thread, ocomm, ocomm_id, - n_way, work_id, + n_way, work_id, free_comm, bszid, sub_node @@ -72,7 +72,7 @@ void bli_thrinfo_init thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 8e5a6da3b..6b9809684 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -75,54 +75,54 @@ typedef struct thrinfo_s thrinfo_t; // thrinfo_t query (field only) -BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_num_threads( const thrinfo_t* t ) { return (t->ocomm)->n_threads; } -BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_ocomm_id( const thrinfo_t* t ) { return t->ocomm_id; } -BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_n_way( const thrinfo_t* t ) { return t->n_way; } -BLIS_INLINE dim_t 
bli_thread_work_id( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_work_id( const thrinfo_t* t ) { return t->work_id; } -BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) +BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( const thrinfo_t* t ) { return t->ocomm; } -BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) +BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) { return t->free_comm; } -BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_bszid( const thrinfo_t* t ) { return t->bszid; } -BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) +BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t ) { return t->sub_node; } -BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) +BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) -BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) +BLIS_INLINE bool bli_thread_am_ochief( const thrinfo_t* t ) { return t->ocomm_id == 0; } @@ -171,12 +171,12 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* // other thrinfo_t-related functions -BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) +BLIS_INLINE void* bli_thread_broadcast( const thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } -BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) +BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } @@ -192,7 +192,7 @@ thrinfo_t* bli_thrinfo_create thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node @@ -204,7 +204,7 @@ void bli_thrinfo_init thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index 881990f78..966247fd0 100644 
--- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -37,9 +37,9 @@ void bli_thrinfo_sup_grow ( - rntm_t* rntm, - bszid_t* bszid_par, - thrinfo_t* thread + rntm_t* rntm, + const bszid_t* bszid_par, + thrinfo_t* thread ) { if ( thread == &BLIS_GEMM_SINGLE_THREADED || @@ -75,10 +75,10 @@ void bli_thrinfo_sup_grow thrinfo_t* bli_thrinfo_sup_rgrow ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_cur, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_cur, + thrinfo_t* thread_par ) { thrinfo_t* thread_cur; @@ -139,10 +139,10 @@ thrinfo_t* bli_thrinfo_sup_rgrow thrinfo_t* bli_thrinfo_sup_create_for_cntl ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_chl, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_chl, + thrinfo_t* thread_par ) { // If we are running with a single thread, all of the code can be reduced @@ -151,14 +151,14 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl { thrinfo_t* thread_chl = bli_thrinfo_create ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL // sub_node + rntm, // rntm + &BLIS_SINGLE_COMM, // ocomm + 0, // ocomm_id + 1, // n_way + 0, // work_id + FALSE, // free_comm + BLIS_NO_PART, // bszid + NULL // sub_node ); return thread_chl; diff --git a/frame/thread/bli_thrinfo_sup.h b/frame/thread/bli_thrinfo_sup.h index 0be035cf8..1afcd3337 100644 --- a/frame/thread/bli_thrinfo_sup.h +++ b/frame/thread/bli_thrinfo_sup.h @@ -42,25 +42,25 @@ void bli_thrinfo_sup_grow ( - rntm_t* rntm, - bszid_t* bszid_par, - thrinfo_t* thread + rntm_t* rntm, + const bszid_t* bszid_par, + thrinfo_t* thread ); thrinfo_t* bli_thrinfo_sup_rgrow ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_cur, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_cur, + thrinfo_t* thread_par ); thrinfo_t* 
bli_thrinfo_sup_create_for_cntl ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_chl, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_chl, + thrinfo_t* thread_par ); #endif diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index 3693ea39c..a96f6f5e9 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ ) \ { \ bli_utilv_xa_check( x, asum ); \ @@ -58,7 +58,7 @@ GENFRONT( asumv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_utilm_mkhst_check( x ); \ @@ -74,8 +74,8 @@ GENFRONT( mktrim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ) \ { \ bli_utilv_norm_check( x, norm ); \ @@ -91,8 +91,8 @@ GENFRONT( normiv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ) \ { \ bli_utilm_norm_check( x, norm ); \ @@ -108,7 +108,7 @@ GENFRONT( normim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_utilm_rand_check( x ); \ @@ -125,9 +125,9 @@ GENFRONT( randnm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ ) \ { \ bli_utilv_sumsqv_check( x, scale, sumsq ); \ @@ -142,9 +142,9 @@ GENFRONT( sumsqv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi, \ - bool* is_eq \ + const obj_t* chi, \ + const obj_t* psi, \ + const bool* is_eq \ ) \ { \ bli_l0_xxbsc_check( chi, psi, is_eq ); \ @@ -158,9 +158,9 @@ GENFRONT( eqsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + const bool* is_eq \ ) \ { \ bli_l1v_xy_check( x, y ); \ @@ -174,9 +174,9 @@ GENFRONT( eqv ) \ void PASTEMAC(opname,_check) \ 
( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + const bool* is_eq \ ) \ { \ bli_l1m_xy_check( x, y ); \ @@ -190,11 +190,11 @@ GENFRONT( eqm ) \ void PASTEMAC(opname,_check) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_utilm_fprint_check( file, s1, x, format, s2 ); \ @@ -207,8 +207,8 @@ GENFRONT( fprintm ) void bli_utilv_xa_check ( - obj_t* x, - obj_t* asum + const obj_t* x, + const obj_t* asum ) { err_t e_val; @@ -240,7 +240,7 @@ void bli_utilv_xa_check void bli_utilm_mkhst_check ( - obj_t* a + const obj_t* a ) { err_t e_val; @@ -277,8 +277,8 @@ void bli_utilm_mkhst_check void bli_utilv_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ) { err_t e_val; @@ -317,8 +317,8 @@ void bli_utilv_norm_check void bli_utilm_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ) { err_t e_val; @@ -356,35 +356,35 @@ void bli_utilm_norm_check void bli_utilm_fprint_check ( - FILE* file, - char* s1, - obj_t* x, - char* format, - char* s2 + const FILE* file, + const char* s1, + const obj_t* x, + const char* format, + const char* s2 ) { err_t e_val; // Check argument pointers. - + e_val = bli_check_null_pointer( file ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( s1 ); bli_check_error_code( e_val ); - e_val = bli_check_null_pointer( s2 ); + e_val = bli_check_null_pointer( s2 ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
- e_val = bli_check_object_buffer( x ); + e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } void bli_utilm_rand_check ( - obj_t* x + const obj_t* x ) { err_t e_val; @@ -405,9 +405,9 @@ void bli_utilm_rand_check void bli_utilv_sumsqv_check ( - obj_t* x, - obj_t* scale, - obj_t* sumsq + const obj_t* x, + const obj_t* scale, + const obj_t* sumsq ) { err_t e_val; @@ -430,15 +430,15 @@ void bli_utilv_sumsqv_check e_val = bli_check_scalar_object( scale ); bli_check_error_code( e_val ); - + e_val = bli_check_scalar_object( sumsq ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). - + e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); - + e_val = bli_check_object_buffer( scale ); bli_check_error_code( e_val ); diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index 866a2cd89..c3f4fd1aa 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ ); GENPROT( asumv ) @@ -54,7 +54,7 @@ GENPROT( asumv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENPROT( mkherm ) @@ -67,8 +67,8 @@ GENPROT( mktrim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ); GENPROT( norm1v ) @@ -81,8 +81,8 @@ GENPROT( normiv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ); GENPROT( norm1m ) @@ -95,7 +95,7 @@ GENPROT( normim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENPROT( randv ) @@ -109,9 +109,9 @@ GENPROT( randnm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ ); GENPROT( sumsqv ) @@ -123,9 +123,9 @@ GENPROT( sumsqv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi, \ - 
bool* is_eq \ + const obj_t* chi, \ + const obj_t* psi, \ + const bool* is_eq \ ); GENTPROT( eqsc ) @@ -136,9 +136,9 @@ GENTPROT( eqsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + const bool* is_eq \ ); GENPROT( eqv ) @@ -150,11 +150,11 @@ GENPROT( eqm ) \ void PASTEMAC(opname,_check) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ); GENPROT( fprintv ) @@ -164,51 +164,51 @@ GENPROT( fprintm ) void bli_utilv_xi_check ( - obj_t* x, - obj_t* index + const obj_t* x, + const obj_t* index ); void bli_utilv_xa_check ( - obj_t* x, - obj_t* asum + const obj_t* x, + const obj_t* asum ); void bli_utilm_mkhst_check ( - obj_t* a + const obj_t* a ); void bli_utilv_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ); void bli_utilm_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ); void bli_utilm_fprint_check ( - FILE* file, - char* s1, - obj_t* x, - char* format, - char* s2 + const FILE* file, + const char* s1, + const obj_t* x, + const char* format, + const char* s2 ); void bli_utilm_rand_check ( - obj_t* x + const obj_t* x ); void bli_utilv_sumsqv_check ( - obj_t* x, - obj_t* scale, - obj_t* sumsq + const obj_t* x, + const obj_t* scale, + const obj_t* sumsq ); diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h index 673f4782a..ccdd7ae66 100644 --- a/frame/util/bli_util_ft.h +++ b/frame/util/bli_util_ft.h @@ -44,9 +44,9 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); @@ -76,9 +76,9 @@ INSERT_GENTDEF( mktrim ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const 
ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -93,13 +93,13 @@ INSERT_GENTDEFR( normiv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -114,12 +114,12 @@ INSERT_GENTDEFR( normim ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTDEF( fprintv ) @@ -131,13 +131,13 @@ INSERT_GENTDEF( fprintv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTDEF( fprintm ) @@ -182,10 +182,10 @@ INSERT_GENTDEF( randnm ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); @@ -204,10 +204,10 @@ INSERT_GENTDEFR( sumsqv ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi, \ - bool* is_eq \ + conj_t conjchi, \ + const ctype* chi, \ + const ctype* psi, \ + bool* is_eq \ ); INSERT_GENTDEF( eqsc ) @@ -219,11 +219,11 @@ INSERT_GENTDEF( eqsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - bool* is_eq \ + 
conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + bool* is_eq \ ); INSERT_GENTDEF( eqv ) @@ -235,15 +235,15 @@ INSERT_GENTDEF( eqv ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ); INSERT_GENTDEF( eqm ) diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index afd221a58..d4e5617ee 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -54,16 +54,16 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - dim_t n = bli_obj_vector_dim( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ + dim_t n = bli_obj_vector_dim( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ \ - void* buf_asum = bli_obj_buffer_at_off( asum ); \ + void* buf_asum = bli_obj_buffer_at_off( asum ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, asum ); \ + PASTEMAC(opname,_check)( x, asum ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -88,7 +88,7 @@ GENFRONT( asumv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* a \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -96,16 +96,16 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - num_t dt = bli_obj_dt( a ); \ + num_t dt = bli_obj_dt( a ); \ \ - uplo_t uploa = bli_obj_uplo( a ); \ - dim_t m = bli_obj_length( a ); \ - void* buf_a = bli_obj_buffer_at_off( a ); \ - inc_t rs_a = bli_obj_row_stride( a ); \ - inc_t cs_a = bli_obj_col_stride( a ); \ + uplo_t uploa = bli_obj_uplo( a ); \ + dim_t m = bli_obj_length( a ); \ + void* buf_a = bli_obj_buffer_at_off( a ); \ + inc_t rs_a = bli_obj_row_stride( a ); \ + inc_t cs_a = bli_obj_col_stride( a ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( a ); \ + PASTEMAC(opname,_check)( a ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -132,8 +132,8 @@ GENFRONT( mktrim ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -141,15 +141,15 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - dim_t n = bli_obj_vector_dim( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ - void* buf_norm = bli_obj_buffer_at_off( norm ); \ + dim_t n = bli_obj_vector_dim( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ + void* buf_norm = bli_obj_buffer_at_off( norm ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, norm ); \ + PASTEMAC(opname,_check)( x, norm ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -176,8 +176,8 @@ GENFRONT( normiv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -185,20 +185,20 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - doff_t diagoffx = bli_obj_diag_offset( x ); \ - diag_t diagx = bli_obj_diag( x ); \ - uplo_t uplox = bli_obj_uplo( x ); \ - dim_t m = bli_obj_length( x ); \ - dim_t n = bli_obj_width( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t rs_x = bli_obj_row_stride( x ); \ - inc_t cs_x = bli_obj_col_stride( x ); \ - void* buf_norm = bli_obj_buffer_at_off( norm ); \ + doff_t diagoffx = bli_obj_diag_offset( x ); \ + diag_t diagx = bli_obj_diag( x ); \ + uplo_t uplox = bli_obj_uplo( x ); \ + dim_t m = bli_obj_length( x ); \ + dim_t n = bli_obj_width( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ + void* buf_norm = bli_obj_buffer_at_off( norm ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, norm ); \ + PASTEMAC(opname,_check)( x, norm ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -229,7 +229,7 @@ GENFRONT( normim ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -237,14 +237,14 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - dim_t n = bli_obj_vector_dim( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ + dim_t n = bli_obj_vector_dim( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x ); \ + PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -269,7 +269,7 @@ GENFRONT( randnv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -277,18 +277,18 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - doff_t diagoffx = bli_obj_diag_offset( x ); \ - uplo_t uplox = bli_obj_uplo( x ); \ - dim_t m = bli_obj_length( x ); \ - dim_t n = bli_obj_width( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t rs_x = bli_obj_row_stride( x ); \ - inc_t cs_x = bli_obj_col_stride( x ); \ + doff_t diagoffx = bli_obj_diag_offset( x ); \ + uplo_t uplox = bli_obj_uplo( x ); \ + dim_t m = bli_obj_length( x ); \ + dim_t n = bli_obj_width( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x ); \ + PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -316,9 +316,9 @@ GENFRONT( randnm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -326,16 +326,16 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_EX_DECLS \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - dim_t n = bli_obj_vector_dim( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ - void* buf_scale = bli_obj_buffer_at_off( scale ); \ - void* buf_sumsq = bli_obj_buffer_at_off( sumsq ); \ + dim_t n = bli_obj_vector_dim( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ + void* buf_scale = bli_obj_buffer_at_off( scale ); \ + void* buf_sumsq = bli_obj_buffer_at_off( sumsq ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, scale, sumsq ); \ + PASTEMAC(opname,_check)( x, scale, sumsq ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -366,19 +366,19 @@ GENFRONT( sumsqv ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi, \ - bool* is_eq \ + const obj_t* chi, \ + const obj_t* psi, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ \ - num_t dt_chi = bli_obj_dt( chi ); \ - num_t dt_psi = bli_obj_dt( psi ); \ - num_t dt; \ + num_t dt_chi = bli_obj_dt( chi ); \ + num_t dt_psi = bli_obj_dt( psi ); \ + num_t dt; \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi, psi, is_eq ); \ + PASTEMAC(opname,_check)( chi, psi, is_eq ); \ \ /* Decide which datatype will be used to query the buffer from the constant object (if there is one). 
*/ \ @@ -427,29 +427,29 @@ GENFRONT( eqsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - dim_t n = bli_obj_vector_dim( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t inc_x = bli_obj_vector_inc( x ); \ - void* buf_y = bli_obj_buffer_at_off( y ); \ - inc_t inc_y = bli_obj_vector_inc( y ); \ + dim_t n = bli_obj_vector_dim( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t inc_x = bli_obj_vector_inc( x ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t inc_y = bli_obj_vector_inc( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, y, is_eq ); \ + PASTEMAC(opname,_check)( x, y, is_eq ); \ \ /* Query the conj status of each object and use the two to come up with a single "net" conj_t value. */ \ - conj_t conjx = bli_obj_conj_status( x ); \ - conj_t conjy = bli_obj_conj_status( y ); \ - conj_t conj = bli_apply_conj( conjx, conjy ); \ + conj_t conjx = bli_obj_conj_status( x ); \ + conj_t conjy = bli_obj_conj_status( y ); \ + conj_t conj = bli_apply_conj( conjx, conjy ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ @@ -474,29 +474,29 @@ GENFRONT( eqv ) \ void PASTEMAC0(opname) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - doff_t diagoffx = bli_obj_diag_offset( x ); \ - diag_t diagx = bli_obj_diag( x ); \ - uplo_t uplox = bli_obj_uplo( x ); \ - dim_t m = bli_obj_length( y ); \ - dim_t n = bli_obj_width( y ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t rs_x = bli_obj_row_stride( x ); \ - inc_t cs_x = bli_obj_col_stride( x ); \ - void* buf_y = bli_obj_buffer_at_off( y ); \ - inc_t rs_y = bli_obj_row_stride( y ); \ - inc_t cs_y = bli_obj_col_stride( y ); \ + doff_t diagoffx = bli_obj_diag_offset( x ); \ + diag_t diagx = bli_obj_diag( x ); \ + uplo_t uplox = bli_obj_uplo( x ); \ + dim_t m = bli_obj_length( y ); \ + dim_t n = bli_obj_width( y ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ + void* buf_y = bli_obj_buffer_at_off( y ); \ + inc_t rs_y = bli_obj_row_stride( y ); \ + inc_t cs_y = bli_obj_col_stride( y ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, y, is_eq ); \ + PASTEMAC(opname,_check)( x, y, is_eq ); \ \ /* Query the combined trans and conj status of each object and use the two to come up with a single "net" trans_t value. 
*/ \ @@ -531,23 +531,23 @@ GENFRONT( eqm ) \ void PASTEMAC0(opname) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - dim_t n = bli_obj_vector_dim( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t incx = bli_obj_vector_inc( x ); \ + dim_t n = bli_obj_vector_dim( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t incx = bli_obj_vector_inc( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ + PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ \ /* Handle constants up front. */ \ if ( dt == BLIS_CONSTANT ) \ @@ -579,34 +579,34 @@ GENFRONT( fprintv ) \ void PASTEMAC0(opname) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ \ - num_t dt = bli_obj_dt( x ); \ + num_t dt = bli_obj_dt( x ); \ \ - dim_t m = bli_obj_length( x ); \ - dim_t n = bli_obj_width( x ); \ - void* buf_x = bli_obj_buffer_at_off( x ); \ - inc_t rs_x = bli_obj_row_stride( x ); \ - inc_t cs_x = bli_obj_col_stride( x ); \ + dim_t m = bli_obj_length( x ); \ + dim_t n = bli_obj_width( x ); \ + void* buf_x = bli_obj_buffer_at_off( x ); \ + inc_t rs_x = bli_obj_row_stride( x ); \ + inc_t cs_x = bli_obj_col_stride( x ); \ \ if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ + PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ \ /* Handle constants up front. 
*/ \ if ( dt == BLIS_CONSTANT ) \ { \ - float* sp = bli_obj_buffer_for_const( BLIS_FLOAT, x ); \ - double* dp = bli_obj_buffer_for_const( BLIS_DOUBLE, x ); \ - scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \ - dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \ - gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, x ); \ + const float* sp = bli_obj_buffer_for_const( BLIS_FLOAT, x ); \ + const double* dp = bli_obj_buffer_for_const( BLIS_DOUBLE, x ); \ + const scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \ + const dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \ + const gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, x ); \ \ fprintf( file, "%s\n", s1 ); \ fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) ); \ @@ -645,10 +645,10 @@ GENFRONT( fprintm ) \ void PASTEMAC0(opname) \ ( \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 92ce6c95f..ab48f841a 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); @@ -55,7 +55,7 @@ GENPROT( asumv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* a \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); @@ -69,8 +69,8 @@ GENPROT( mktrim ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); @@ -84,8 +84,8 @@ GENPROT( normiv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); @@ -99,7 +99,7 @@ GENPROT( normim ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ 
BLIS_OAPI_EX_PARAMS \ ); @@ -112,7 +112,7 @@ GENPROT( randnv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); @@ -125,9 +125,9 @@ GENPROT( randnm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); @@ -139,42 +139,15 @@ GENPROT( sumsqv ) #ifdef BLIS_OAPI_BASIC -/* -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* chi, \ - obj_t* psi, \ - bool* is_eq \ - ); - -GENPROT( eqsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ - ); - -GENPROT( eqv ) -*/ - #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + bool* is_eq \ ); GENPROT( eqsc ) @@ -187,11 +160,11 @@ GENPROT( eqm ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ); GENPROT( fprintv ) @@ -203,10 +176,10 @@ GENPROT( fprintm ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ); GENPROT( printv ) diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index ca0b3c279..abc9c9089 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -45,9 +45,9 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -71,9 +71,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ 
PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ - x, incx, \ - asum, \ - cntx, \ + ( ctype* )x, incx, \ + asum, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -86,9 +86,9 @@ INSERT_GENTFUNCR_BASIC0( asumv ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -109,7 +109,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ uploa, \ m, \ a, rs_a, cs_a, \ - cntx, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -124,9 +124,9 @@ INSERT_GENTFUNC_BASIC0( mktrim ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -150,9 +150,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ - x, incx, \ - norm, \ - cntx, \ + ( ctype* )x, incx, \ + norm, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -167,13 +167,13 @@ INSERT_GENTFUNCR_BASIC0( normiv ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -201,9 +201,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ uplox, \ m, \ n, \ - x, rs_x, cs_x, \ - norm, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + norm, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -218,8 +218,8 @@ INSERT_GENTFUNCR_BASIC0( normim ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx \ + dim_t n, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -247,7 +247,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ n, \ x, incx, \ - cntx, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -274,11 +274,11 @@ INSERT_GENTFUNCR_BASIC0( randnv ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t 
diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -309,7 +309,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ m, \ n, \ x, rs_x, cs_x, \ - cntx, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -340,10 +340,10 @@ INSERT_GENTFUNCR_BASIC0( randnm ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -362,10 +362,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ - x, incx, \ - scale, \ - sumsq, \ - cntx, \ + ( ctype* )x, incx, \ + scale, \ + sumsq, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -383,10 +383,10 @@ INSERT_GENTFUNCR_BASIC0( sumsqv ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi, \ - bool* is_eq \ + conj_t conjchi, \ + const ctype* chi, \ + const ctype* psi, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -406,11 +406,11 @@ INSERT_GENTFUNC_BASIC0( eqsc ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - bool* is_eq \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -425,8 +425,8 @@ void PASTEMAC(ch,opname) \ ( \ conjx, \ n, \ - x, incx, \ - y, incy \ + ( ctype* )x, incx, \ + ( ctype* )y, incy \ ); \ } @@ -438,15 +438,15 @@ INSERT_GENTFUNC_BASIC0( eqv ) \ void PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, 
inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -467,8 +467,8 @@ void PASTEMAC(ch,opname) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y \ + ( ctype* )x, rs_x, cs_x, \ + ( ctype* )y, rs_y, cs_y \ ); \ } @@ -480,11 +480,11 @@ INSERT_GENTFUNC_BASIC0( eqm ) \ void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t n, \ + const void* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ @@ -508,12 +508,12 @@ INSERT_GENTFUNC_BASIC_I( printv, fprintv ) \ void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const void* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index 43fbbdb06..29c67df23 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -42,9 +42,9 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); @@ -56,9 +56,9 @@ INSERT_GENTPROTR_BASIC0( asumv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -72,9 +72,9 @@ INSERT_GENTPROT_BASIC0( mktrim ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -88,13 +88,13 @@ INSERT_GENTPROTR_BASIC0( normiv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - 
diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -108,8 +108,8 @@ INSERT_GENTPROTR_BASIC0( normim ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx \ + dim_t n, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); @@ -122,11 +122,11 @@ INSERT_GENTPROT_BASIC0( randnv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -139,10 +139,10 @@ INSERT_GENTPROT_BASIC0( randnm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); @@ -159,10 +159,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi, \ - bool* is_eq \ + conj_t conjchi, \ + const ctype* chi, \ + const ctype* psi, \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) @@ -173,11 +173,11 @@ INSERT_GENTPROT_BASIC0( eqsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - bool* is_eq \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) @@ -188,15 +188,15 @@ INSERT_GENTPROT_BASIC0( eqv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t 
rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) @@ -207,11 +207,11 @@ INSERT_GENTPROT_BASIC0( eqm ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t n, \ + const void* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) @@ -222,12 +222,12 @@ INSERT_GENTPROT_BASIC0_I( printv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const void* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index af550681a..2b65c8460 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -52,11 +52,11 @@ void PASTEMAC(ch,varname) \ rntm_t* rntm \ ) \ { \ - ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r absum; \ - dim_t i; \ + ctype* chi1; \ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r absum; \ + dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. */ \ PASTEMAC(chr,set0s)( absum ); \ @@ -239,10 +239,10 @@ void PASTEMAC(ch,varname) \ rntm_t* rntm \ ) \ { \ - ctype* chi1; \ - ctype_r abs_chi1; \ - ctype_r absum; \ - dim_t i; \ + ctype* chi1; \ + ctype_r abs_chi1; \ + ctype_r absum; \ + dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. 
*/ \ PASTEMAC(chr,set0s)( absum ); \ @@ -455,10 +455,10 @@ void PASTEMAC(ch,varname) \ rntm_t* rntm \ ) \ { \ - ctype* chi1; \ - ctype_r abs_chi1; \ - ctype_r abs_chi1_max; \ - dim_t i; \ + ctype* chi1; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ \ /* Initialize the maximum absolute value to zero. */ \ PASTEMAC(chr,set0s)( abs_chi1_max ); \ @@ -505,19 +505,19 @@ void PASTEMAC(ch,varname) \ rntm_t* rntm \ ) \ { \ - ctype* one = PASTEMAC(ch,1); \ - ctype* x0; \ - ctype* chi1; \ - ctype* x2; \ - ctype_r absum_max; \ - ctype_r absum_j; \ - ctype_r abval_chi1; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + ctype* one = PASTEMAC(ch,1); \ + ctype* x0; \ + ctype* chi1; \ + ctype* x2; \ + ctype_r absum_max; \ + ctype_r absum_j; \ + ctype_r abval_chi1; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Initialize the maximum absolute column sum to zero. */ \ PASTEMAC(chr,set0s)( absum_max ); \ @@ -904,20 +904,20 @@ void PASTEMAC(ch,varname) \ rntm_t* rntm \ ) \ { \ - ctype* one = PASTEMAC(ch,1); \ - ctype* x0; \ - ctype* x1; \ - ctype* x2; \ - ctype* chi1; \ - ctype beta; \ - ctype omega; \ - double max_m_n; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + ctype* one = PASTEMAC(ch,1); \ + ctype* x0; \ + ctype* x1; \ + ctype* x2; \ + ctype* chi1; \ + ctype beta; \ + ctype omega; \ + double max_m_n; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Set various loop parameters. Here, we pretend that diagx is equal to BLIS_NONUNIT_DIAG because we handle the unit diagonal case manually. 
*/ \ @@ -1059,16 +1059,16 @@ void PASTEMAC(ch,varname) \ rntm_t* rntm \ ) \ { \ - const ctype_r zero_r = *PASTEMAC(chr,0); \ - const ctype_r one_r = *PASTEMAC(chr,1); \ + ctype_r zero_r = *PASTEMAC(chr,0); \ + ctype_r one_r = *PASTEMAC(chr,1); \ \ - ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r scale_r; \ - ctype_r sumsq_r; \ - ctype_r abs_chi1_r; \ - dim_t i; \ + ctype* chi1; \ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r scale_r; \ + ctype_r sumsq_r; \ + ctype_r abs_chi1_r; \ + dim_t i; \ \ /* NOTE: This function attempts to mimic the algorithm for computing the Frobenius norm in netlib LAPACK's ?lassq(). */ \ @@ -1143,10 +1143,10 @@ INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 ) \ bool PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ ) \ { \ for ( dim_t i = 0; i < n; ++i ) \ @@ -1298,25 +1298,23 @@ INSERT_GENTFUNC_BASIC0( eqm_unb_var1 ) \ void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ) \ { \ - dim_t i; \ - ctype* chi1; \ - char default_spec[32] = PASTEMAC(ch,formatspec)(); \ + const char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; \ \ - chi1 = x; \ + const ctype*chi1 = x; \ \ fprintf( file, "%s\n", s1 ); \ \ - for ( i = 0; i < n; ++i ) \ + for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,fprints)( file, format, *chi1 ); \ fprintf( file, "\n" ); \ @@ -1335,28 +1333,26 @@ INSERT_GENTFUNC_BASIC0_I( fprintv ) \ void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t 
rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ) \ { \ - dim_t i, j; \ - ctype* chi1; \ - char default_spec[32] = PASTEMAC(ch,formatspec)(); \ + const char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; \ \ fprintf( file, "%s\n", s1 ); \ \ - for ( i = 0; i < m; ++i ) \ + for ( dim_t i = 0; i < m; ++i ) \ { \ - for ( j = 0; j < n; ++j ) \ + for ( dim_t j = 0; j < n; ++j ) \ { \ - chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \ + const ctype* chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \ \ PASTEMAC(ch,fprints)( file, format, *chi1 ); \ fprintf( file, " " ); \ diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index f87848856..435efa4ac 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -162,10 +162,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) \ bool PASTEMAC(ch,varname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) @@ -194,12 +194,12 @@ INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintv ) @@ -210,13 +210,13 @@ INSERT_GENTPROT_BASIC0_I( fprintv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintm ) diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c 
b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c index 7171347bf..8f1122b45 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c @@ -52,7 +52,7 @@ void bli_dpackm_armsve256_int_8xk double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict p, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int64_t cdim = cdim_; diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c index a086b3a76..5866ed26f 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c @@ -51,7 +51,7 @@ void bli_dpackm_armsve512_asm_10xk double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict p, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int64_t cdim = cdim_; diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c index aeb323c0c..88ccb4b8e 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c @@ -50,7 +50,7 @@ void bli_dpackm_armsve512_asm_16xk double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict p, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int64_t cdim = cdim_; diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 098d5d4b5..9bc7fd949 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_cgemm_armsve_asm_2vx10_unindexed scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git 
a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 0ee470f24..1c9d68dec 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_dgemm_armsve_asm_2vx10_unindexed double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index d03af5923..05005f8c3 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_sgemm_armsve_asm_2vx10_unindexed float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 8636a527b..210d40f0b 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_zgemm_armsve_asm_2vx10_unindexed dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c index c248285c3..4dec190e0 100644 --- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c +++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c @@ -43,7 +43,7 @@ void bli_sgemm_armv7a_ker_4x4 float* restrict b, 
float* restrict beta, float* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_sgemm_armv7a_asm_4x4 @@ -56,8 +56,8 @@ void bli_sgemm_armv7a_asm_4x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -78,7 +78,7 @@ void bli_dgemm_armv7a_ker_4x4 double* restrict b, double* restrict beta, double* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_dgemm_armv7a_asm_4x4 @@ -91,8 +91,8 @@ void bli_dgemm_armv7a_asm_4x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -113,7 +113,7 @@ void bli_cgemm_armv7a_ker_2x2 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_cgemm_armv7a_asm_2x2 @@ -126,8 +126,8 @@ void bli_cgemm_armv7a_asm_2x2 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -148,7 +148,7 @@ void bli_zgemm_armv7a_ker_2x2 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_zgemm_armv7a_asm_2x2 @@ -161,8 +161,8 @@ void bli_zgemm_armv7a_asm_2x2 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t 
are a diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c index 06f36a346..b1e9481a3 100644 --- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c +++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c @@ -45,8 +45,8 @@ void bli_sgemm_armv7a_int_4x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -251,8 +251,8 @@ void bli_dgemm_armv7a_int_4x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c index 301b8ad79..3eefd9ddc 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c @@ -57,7 +57,7 @@ void bli_dpackm_armv8a_int_6xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. 
@@ -296,7 +296,7 @@ void bli_dpackm_armv8a_int_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -316,7 +316,7 @@ void bli_dpackm_armv8a_int_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c index 321fa5403..51b064a24 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c @@ -57,7 +57,7 @@ void bli_dpackm_armv8a_int_8xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. @@ -326,7 +326,7 @@ void bli_dpackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -346,7 +346,7 @@ void bli_dpackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c index 371877247..f915215e1 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c @@ -57,7 +57,7 @@ void bli_spackm_armv8a_int_12xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. 
@@ -410,7 +410,7 @@ void bli_spackm_armv8a_int_12xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -428,7 +428,7 @@ void bli_spackm_armv8a_int_12xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c index 3d363c2d8..b508b2a0e 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c @@ -57,7 +57,7 @@ void bli_spackm_armv8a_int_8xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. @@ -348,7 +348,7 @@ void bli_spackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -366,7 +366,7 @@ void bli_spackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 4d9a88817..94f0090bc 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -61,8 +61,8 @@ void bli_sgemm_armv8a_asm_8x12 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -740,8 +740,8 @@ void bli_dgemm_armv8a_asm_6x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -1462,8 +1462,8 @@ void bli_cgemm_armv8a_opt_4x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -1478,8 +1478,8 @@ void 
bli_zgemm_armv8a_opt_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c index c87ff1feb..44e0ac419 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c @@ -57,8 +57,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large @@ -262,8 +262,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c index 630459db7..cade3ee05 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c @@ -121,8 +121,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( n0 != 8 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c index e13dd668e..06c9ac32c 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c @@ -114,8 +114,8 @@ void 
bli_dgemmsup_rd_armv8a_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( m0 != 6 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c index 16001a73c..312eb4454 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c @@ -98,7 +98,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) /* * 4x8 dgemmsup kernel with extending 1st dimension. * - * Recommanded usage case: + * Recommanded usage case: * o 16 < (L1 cache latency) * (Num. FPU) < 25. * o L1 cache has a bandwidth not too low (true in most cases). * o (FMLA latency) * (Num. FPU) < 32 cycles (true in almost all cases). @@ -115,8 +115,8 @@ void bli_dgemmsup_rv_armv8a_asm_4x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c index 43913cd38..bc7402a5f 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c @@ -98,7 +98,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) /* * 4x8 dgemmsup kernel with extending 2nd dimension. * - * Recommanded usage case: + * Recommanded usage case: * o 16 < (L1 cache latency) * (Num. FPU) < 25. * o L1 cache has a bandwidth not too low (true in most cases). * o (FMLA latency) * (Num. FPU) < 32 cycles (true in almost all cases). 
@@ -115,8 +115,8 @@ void bli_dgemmsup_rv_armv8a_asm_4x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c index 3100112d3..8ff5ec173 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c @@ -140,8 +140,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( n0 != 8 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c index fb9357c11..9bdf4b3b8 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c @@ -140,8 +140,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( m0 != 6 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c index 5b0e9b062..4d374df98 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c @@ -111,8 +111,8 @@ void bli_dgemmsup_rv_armv8a_asm_8x4m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Fixme: This 
uker has no dispatching for unalighed sizes. diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c index 84c7c4a7d..aa53de55c 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c @@ -94,8 +94,8 @@ void bli_dgemmsup_rd_armv8a_asm_3x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { assert( m0 == 3 ); diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c index abbb6fb4d..b10546764 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c @@ -118,8 +118,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x3 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { assert( m0 == 6 ); diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c index 43880063e..5438fdfc2 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rd_armv8a_int_2x8 double* restrict b, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { assert( m0 <= 2 ); @@ -114,10 +114,10 @@ void bli_dgemmsup_rd_armv8a_int_2x8 PRAGMA_UNROLL for ( ; k_mker > 0; --k_mker ) { - // if ( m0 > 0 ) + // if ( m0 > 0 ) va_0 = vld1q_f64( a_loc + rs_a * 0 ); if ( m0 > 1 ) va_1 = vld1q_f64( a_loc + rs_a * 1 
); - // if ( n0 > 0 ) + // if ( n0 > 0 ) vb_0 = vld1q_f64( b_loc + cs_b * 0 ); if ( n0 > 1 ) vb_1 = vld1q_f64( b_loc + cs_b * 1 ); if ( n0 > 2 ) vb_2 = vld1q_f64( b_loc + cs_b * 2 ); @@ -174,10 +174,10 @@ void bli_dgemmsup_rd_armv8a_int_2x8 PRAGMA_NOUNROLL for ( ; k_left > 0; --k_left ) { - // if ( m0 > 0 ) + // if ( m0 > 0 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 ); if ( m0 > 1 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 1, va_1, 0 ); - // if ( n0 > 0 ) + // if ( n0 > 0 ) vb_0 = vld1q_lane_f64( b_loc + cs_b * 0, vb_0, 0 ); if ( n0 > 1 ) vb_1 = vld1q_lane_f64( b_loc + cs_b * 1, vb_1, 0 ); if ( n0 > 2 ) vb_2 = vld1q_lane_f64( b_loc + cs_b * 2, vb_2, 0 ); diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c index 73e5f20fb..89817d6d5 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rd_armv8a_int_3x4 double* restrict b, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // if ( m0 == 3 && n0 == 4 ) diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c index 16af42ade..931f3ed66 100644 --- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c +++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rv_armv8a_int_3x8mn double* restrict b0, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c0, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Unlike the rd case, this rv case does not impose restriction upon diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c 
b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c index 8bbd87f1f..f850b0fa6 100644 --- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c +++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn double* restrict b0, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c0, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Unlike the rd case, this rv case does not impose restriction upon @@ -123,7 +123,7 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn for ( ; k > 0; --k ) { // A columns. - // if ( m0 > 0 ) + // if ( m0 > 0 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 ); if ( m0 > 1 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 1, va_0, 1 ); if ( m0 > 2 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 2, va_1, 0 ); diff --git a/kernels/bgq/1/bli_axpyv_bgq_int.c b/kernels/bgq/1/bli_axpyv_bgq_int.c index 0c4a8cbd3..1d233f5c1 100644 --- a/kernels/bgq/1/bli_axpyv_bgq_int.c +++ b/kernels/bgq/1/bli_axpyv_bgq_int.c @@ -34,14 +34,14 @@ #include "blis.h" -void bli_daxpyv_bgq_int - ( +void bli_daxpyv_bgq_int + ( conj_t conjx, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { if ( bli_zero_dim1( n ) ) return; @@ -70,7 +70,7 @@ void bli_daxpyv_bgq_int xv = vec_lda( 0 * sizeof(double), &x[i*4] ); yv = vec_lda( 0 * sizeof(double), &y[i*4] ); zv = vec_madd( alphav, xv, yv ); - vec_sta( zv, 0 * sizeof(double), &y[i*4] ); + vec_sta( zv, 0 * sizeof(double), &y[i*4] ); } for ( dim_t i = 0; i < n_left; i++ ) { diff --git a/kernels/bgq/1/bli_dotv_bgq_int.c b/kernels/bgq/1/bli_dotv_bgq_int.c index 73e53c23a..eb6805a4c 100644 --- a/kernels/bgq/1/bli_dotv_bgq_int.c +++ b/kernels/bgq/1/bli_dotv_bgq_int.c @@ -42,7 +42,7 @@ void bli_ddotv_bgq_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx 
+ cntx_t* cntx ) { bool use_ref = FALSE; @@ -91,7 +91,7 @@ void bli_ddotv_bgq_int { rhos += x[4*n_run + i] * y[4*n_run + i]; } - + *rho = rhos; } diff --git a/kernels/bgq/1f/bli_axpyf_bgq_int.c b/kernels/bgq/1f/bli_axpyf_bgq_int.c index 4e296e0a2..cf0fe633c 100644 --- a/kernels/bgq/1f/bli_axpyf_bgq_int.c +++ b/kernels/bgq/1f/bli_axpyf_bgq_int.c @@ -45,7 +45,7 @@ void bli_daxpyf_bgq_int double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fusefac = 8; @@ -60,7 +60,7 @@ void bli_daxpyf_bgq_int use_ref = TRUE; // Call the reference implementation if needed. if ( use_ref == TRUE ) - { + { // printf("%d\t%d\t%d\t%d\t%d\t%d\n", fusefac, inca, incx, incy, bli_is_unaligned_to( ( siz_t )a, 32 ), bli_is_unaligned_to( ( siz_t )y, 32)); // printf("DEFAULTING TO REFERENCE IMPLEMENTATION\n"); BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy, cntx ); @@ -134,7 +134,7 @@ void bli_daxpyf_bgq_int vec_sta( yv, 0 * sizeof(double), &y0[i*4]); } - + for ( dim_t i = 0; i < m_left; ++i ) { y0[4*m_run + i] += chi0 * a0[4*m_run + i] diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c index 15e3e072f..2adbc4c36 100644 --- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c +++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c @@ -64,8 +64,8 @@ void bli_dgemm_bgq_int_8x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false ); @@ -228,8 +228,8 @@ void bli_zgemm_bgq_int_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false ); diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c 
b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c index 3a75d61d7..bef7232dd 100644 --- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c +++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c @@ -98,8 +98,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -590,8 +590,8 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -810,8 +810,8 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1334,8 +1334,8 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c index 843335ad5..e5d077409 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c @@ -51,7 +51,7 @@ void bli_cpackm_haswell_asm_3xk scomplex* restrict kappa, scomplex* restrict a, inc_t inca0, inc_t lda0, scomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c index 862a33b86..fa8fabe9d 100644 
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c @@ -51,7 +51,7 @@ void bli_cpackm_haswell_asm_8xk scomplex* restrict kappa, scomplex* restrict a, inc_t inca0, inc_t lda0, scomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c index b64f26591..47fc5b98d 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c @@ -51,7 +51,7 @@ void bli_dpackm_haswell_asm_6xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ -107,7 +107,7 @@ void bli_dpackm_haswell_asm_6xk if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_dpackm_haswell_asm_6xk mov(var(one), rdx) // load address of 1.0 constant vmovsd(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovsd(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomisd(xmm0, xmm1) // set ZF if kappa == 1.0 je(.DKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_dpackm_haswell_asm_6xk cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.DCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.DROWNONU) @@ -150,7 +150,7 @@ void bli_dpackm_haswell_asm_6xk label(.DCOLNONU) jmp(.DDONE) // jump to end. 
- + @@ -161,7 +161,7 @@ void bli_dpackm_haswell_asm_6xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.DROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -255,7 +255,7 @@ void bli_dpackm_haswell_asm_6xk // -- kappa unit, column storage on A -------------------------------------- label(.DCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -319,8 +319,8 @@ void bli_dpackm_haswell_asm_6xk label(.DDONE) - - + + end_asm( : // output operands (none) @@ -374,7 +374,7 @@ void bli_dpackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -394,7 +394,7 @@ void bli_dpackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c index 9deb564ce..9f07e37a4 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c @@ -51,7 +51,7 @@ void bli_dpackm_haswell_asm_8xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ -107,7 +107,7 @@ void bli_dpackm_haswell_asm_8xk if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_dpackm_haswell_asm_8xk mov(var(one), rdx) // load address of 1.0 constant vmovsd(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovsd(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomisd(xmm0, xmm1) // set ZF if kappa == 1.0 je(.DKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_dpackm_haswell_asm_8xk cmp(imm(8), r8) // set ZF if (8*inca) == 8. 
jz(.DCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.DROWNONU) @@ -150,7 +150,7 @@ void bli_dpackm_haswell_asm_8xk label(.DCOLNONU) jmp(.DDONE) // jump to end. - + @@ -161,7 +161,7 @@ void bli_dpackm_haswell_asm_8xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.DROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -265,7 +265,7 @@ void bli_dpackm_haswell_asm_8xk // -- kappa unit, column storage on A -------------------------------------- label(.DCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -329,8 +329,8 @@ void bli_dpackm_haswell_asm_8xk label(.DDONE) - - + + end_asm( : // output operands (none) @@ -384,7 +384,7 @@ void bli_dpackm_haswell_asm_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -402,7 +402,7 @@ void bli_dpackm_haswell_asm_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c index 40ac22bc5..27b2c71ee 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c @@ -51,7 +51,7 @@ void bli_spackm_haswell_asm_16xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ -100,14 +100,14 @@ void bli_spackm_haswell_asm_16xk // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); - + // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. 
mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_spackm_haswell_asm_16xk mov(var(one), rdx) // load address of 1.0 constant vmovss(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovss(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomiss(xmm0, xmm1) // set ZF if kappa == 1.0 je(.SKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_spackm_haswell_asm_16xk cmp(imm(4), r8) // set ZF if (4*inca) == 4. jz(.SCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.SROWNONU) @@ -150,7 +150,7 @@ void bli_spackm_haswell_asm_16xk label(.SCOLNONU) jmp(.SDONE) // jump to end. - + @@ -161,7 +161,7 @@ void bli_spackm_haswell_asm_16xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.SROWUNIT) lea(mem(r8, r8, 2), r13) // r13 = 3*inca @@ -402,7 +402,7 @@ void bli_spackm_haswell_asm_16xk // -- kappa unit, column storage on A -------------------------------------- label(.SCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda lea(mem(r13, r10, 2), r15) // r15 = 5*lda lea(mem(r13, r10, 4), rdx) // rdx = 7*lda @@ -488,8 +488,8 @@ void bli_spackm_haswell_asm_16xk label(.SDONE) - - + + end_asm( : // output operands (none) @@ -543,7 +543,7 @@ void bli_spackm_haswell_asm_16xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -561,7 +561,7 @@ void bli_spackm_haswell_asm_16xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c index 3a134bed8..a073eca62 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c @@ -51,7 +51,7 @@ void bli_spackm_haswell_asm_6xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ 
-100,14 +100,14 @@ void bli_spackm_haswell_asm_6xk // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); - + // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_spackm_haswell_asm_6xk mov(var(one), rdx) // load address of 1.0 constant vmovss(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovss(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomiss(xmm0, xmm1) // set ZF if kappa == 1.0 je(.SKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_spackm_haswell_asm_6xk cmp(imm(4), r8) // set ZF if (4*inca) == 4. jz(.SCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.SROWNONU) @@ -150,7 +150,7 @@ void bli_spackm_haswell_asm_6xk label(.SCOLNONU) jmp(.SDONE) // jump to end. 
- + @@ -161,7 +161,7 @@ void bli_spackm_haswell_asm_6xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.SROWUNIT) lea(mem(r8, r8, 2), r13) // r13 = 3*inca @@ -274,7 +274,7 @@ void bli_spackm_haswell_asm_6xk // -- kappa unit, column storage on A -------------------------------------- label(.SCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda lea(mem(r13, r10, 2), r15) // r15 = 5*lda lea(mem(r13, r10, 4), rdx) // rdx = 7*lda @@ -361,8 +361,8 @@ void bli_spackm_haswell_asm_6xk label(.SDONE) - - + + end_asm( : // output operands (none) @@ -416,7 +416,7 @@ void bli_spackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -434,7 +434,7 @@ void bli_spackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c index 1a714abe2..5e65565d5 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c @@ -51,7 +51,7 @@ void bli_zpackm_haswell_asm_3xk dcomplex* restrict kappa, dcomplex* restrict a, inc_t inca0, inc_t lda0, dcomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c index 4e11872af..d118081cc 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c @@ -51,7 +51,7 @@ void bli_zpackm_haswell_asm_4xk dcomplex* restrict kappa, dcomplex* restrict a, inc_t inca0, inc_t lda0, dcomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index 70ea4ccd7..b7be1c674 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -87,8 +87,8 @@ void 
bli_sgemm_haswell_asm_6x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -767,8 +767,8 @@ void bli_dgemm_haswell_asm_6x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1326,8 +1326,8 @@ void bli_cgemm_haswell_asm_3x8 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1719,8 +1719,8 @@ void bli_zgemm_haswell_asm_3x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index dd9526d56..261054499 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -86,8 +86,8 @@ void bli_sgemm_haswell_asm_16x6 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -470,8 +470,8 @@ void bli_dgemm_haswell_asm_8x6 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -840,8 +840,8 @@ void bli_cgemm_haswell_asm_8x3 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, 
inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1231,8 +1231,8 @@ void bli_zgemm_haswell_asm_4x3 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index d0d0ff211..915fbf08f 100644 --- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -67,8 +67,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -858,8 +858,8 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 double* restrict b01, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index 68a8c069b..63c42785c 100644 --- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -67,8 +67,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -863,8 +863,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 double* restrict b01, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict 
data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c index 1820277d5..637e5917b 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c @@ -78,8 +78,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 8; @@ -166,7 +166,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -184,7 +184,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -259,18 +259,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -305,7 +305,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -336,7 +336,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -398,27 +398,27 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -429,7 +429,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -451,21 +451,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -473,12 +473,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -500,22 +500,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -555,7 +555,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -563,73 +563,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8m mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -653,7 +653,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m label(.DRETURN) - + end_asm( : // output operands (none) @@ -735,8 +735,8 @@ void bli_dgemmsup_rd_haswell_asm_6x4m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -766,7 +766,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -784,7 +784,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -838,19 +838,19 @@ void bli_dgemmsup_rd_haswell_asm_6x4m prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -885,7 +885,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -916,7 +916,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -978,25 +978,25 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 1 @@ -1004,12 +1004,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4m prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1031,21 +1031,21 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1053,12 +1053,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1080,22 +1080,22 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1134,7 +1134,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1142,73 +1142,73 @@ void bli_dgemmsup_rd_haswell_asm_6x4m mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -1225,7 +1225,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m label(.DRETURN) - + end_asm( : // output operands (none) @@ -1307,8 +1307,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1336,9 +1336,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1356,7 +1356,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1412,19 +1412,19 @@ void bli_dgemmsup_rd_haswell_asm_6x2m prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1462,7 +1462,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1496,7 +1496,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1564,25 +1564,25 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 @@ -1590,7 +1590,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif - + vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; @@ -1620,21 +1620,21 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1642,7 +1642,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1672,12 +1672,12 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1690,7 +1690,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) @@ -1723,7 +1723,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // xmm14[0:1] = sum(ymm14) sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1731,96 +1731,96 @@ void bli_dgemmsup_rd_haswell_asm_6x2m mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + @@ -1838,7 +1838,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c index e720e7da1..d9dad5fea 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c @@ -78,8 +78,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -223,7 +223,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -241,7 +241,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -317,18 +317,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -362,7 +362,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -398,7 +398,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -465,32 +465,32 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -512,21 +512,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -534,12 +534,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -561,22 +561,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -615,7 +615,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -623,73 +623,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -711,7 +711,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -804,8 +804,8 @@ void bli_dgemmsup_rd_haswell_asm_3x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -835,7 +835,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -853,7 +853,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -909,18 +909,18 @@ void bli_dgemmsup_rd_haswell_asm_3x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -954,7 +954,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -990,7 +990,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1057,32 +1057,32 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1104,21 +1104,21 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1126,12 +1126,12 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1153,22 +1153,22 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1206,7 +1206,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1215,73 +1215,73 @@ void bli_dgemmsup_rd_haswell_asm_3x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -1297,7 +1297,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -1391,8 +1391,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1422,7 +1422,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1440,7 +1440,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1491,18 +1491,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1531,7 +1531,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 1 @@ -1562,7 +1562,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1619,31 +1619,31 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1661,21 +1661,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1683,11 +1683,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1705,21 +1705,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1746,7 +1746,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1754,65 +1754,65 @@ void bli_dgemmsup_rd_haswell_asm_2x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -1828,7 +1828,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -1921,8 +1921,8 @@ void bli_dgemmsup_rd_haswell_asm_1x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1952,7 +1952,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1970,7 +1970,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2016,18 +2016,18 @@ void bli_dgemmsup_rd_haswell_asm_1x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2051,7 +2051,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 #if 1 @@ -2077,7 +2077,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -2124,30 +2124,30 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -2161,21 +2161,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2183,10 +2183,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -2200,20 +2200,20 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 - + + // ymm4 ymm7 ymm10 ymm13 + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -2228,7 +2228,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -2236,57 +2236,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -2302,7 +2302,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c index f764bc613..fcf448423 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c @@ -78,8 +78,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 16; @@ -190,7 +190,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -208,7 +208,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -283,18 +283,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -329,7 +329,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -360,7 +360,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -422,27 +422,27 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -453,7 +453,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -475,21 +475,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -497,12 +497,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -524,22 +524,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -555,7 +555,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -571,7 +571,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -590,7 +590,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -598,73 +598,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi 
= cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m label(.SRETURN) - + end_asm( : // output operands (none) @@ -770,8 +770,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -801,7 +801,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -819,7 +819,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -894,18 +894,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -940,7 +940,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -971,7 +971,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1033,27 +1033,27 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1064,7 +1064,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1086,21 +1086,21 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1108,12 +1108,12 @@ void bli_sgemmsup_rd_haswell_asm_6x12m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1135,22 +1135,22 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1166,7 +1166,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1182,7 +1182,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1201,7 +1201,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1209,73 +1209,73 @@ void bli_sgemmsup_rd_haswell_asm_6x12m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), 
rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -1299,7 +1299,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m label(.SRETURN) - + end_asm( : // output operands (none) @@ -1383,8 +1383,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1414,7 +1414,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1432,7 +1432,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1507,18 +1507,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1553,7 +1553,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1584,7 +1584,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1646,27 +1646,27 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1677,7 +1677,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1699,21 +1699,21 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1721,12 +1721,12 @@ void bli_sgemmsup_rd_haswell_asm_6x8m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1748,22 +1748,22 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1779,7 +1779,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1795,7 +1795,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1814,7 +1814,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1822,73 +1822,73 @@ void bli_sgemmsup_rd_haswell_asm_6x8m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) 
// rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -1912,7 +1912,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m label(.SRETURN) - + end_asm( : // output operands (none) @@ -1994,8 +1994,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2025,7 +2025,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -2043,7 +2043,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2098,18 +2098,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2144,7 +2144,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -2175,7 +2175,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2237,27 +2237,27 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2268,7 +2268,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -2290,21 +2290,21 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2312,12 +2312,12 @@ void bli_sgemmsup_rd_haswell_asm_6x4m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -2339,22 +2339,22 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2370,7 +2370,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2386,7 +2386,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2405,7 +2405,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -2413,73 +2413,73 @@ void bli_sgemmsup_rd_haswell_asm_6x4m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) 
// rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -2496,7 +2496,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m label(.SRETURN) - + end_asm( : // output operands (none) @@ -2579,8 +2579,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2610,7 +2610,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -2628,7 +2628,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2685,18 +2685,18 @@ void bli_sgemmsup_rd_haswell_asm_6x2m prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2734,7 +2734,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -2768,7 +2768,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2836,27 +2836,27 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2892,21 +2892,21 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2944,12 +2944,12 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + @@ -2962,7 +2962,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -3007,7 +3007,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m // xmm14[0:1] = sum(ymm14) sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -3015,103 +3015,103 @@ void bli_sgemmsup_rd_haswell_asm_6x2m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 4), r12) // @@ -3128,7 +3128,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c index 1fe862a8d..33b2df4b4 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c @@ -78,8 +78,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -223,7 +223,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -317,18 +317,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -362,7 +362,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -398,7 +398,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -465,32 +465,32 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -512,21 +512,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -534,12 +534,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -561,22 +561,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -592,7 +592,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -608,7 +608,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -627,7 +627,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -635,73 +635,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -723,7 +723,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n label(.SRETURN) - + end_asm( : // output operands (none) @@ -816,8 +816,8 @@ void bli_sgemmsup_rd_haswell_asm_3x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -847,7 +847,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -864,7 +864,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -919,18 +919,18 @@ void bli_sgemmsup_rd_haswell_asm_3x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -964,7 +964,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -1000,7 +1000,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1067,32 +1067,32 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1114,21 +1114,21 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1136,12 +1136,12 @@ void bli_sgemmsup_rd_haswell_asm_3x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1163,22 +1163,22 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1194,7 +1194,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1210,7 +1210,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1229,7 +1229,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1237,73 +1237,73 @@ void bli_sgemmsup_rd_haswell_asm_3x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -1319,7 +1319,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n label(.SRETURN) - + end_asm( : // output operands (none) @@ -1413,8 +1413,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1444,7 +1444,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1461,7 +1461,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1511,18 +1511,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1551,7 +1551,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 1 @@ -1582,7 +1582,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1639,31 +1639,31 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1681,21 +1681,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1703,11 +1703,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1725,21 +1725,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1755,7 +1755,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1774,7 +1774,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1782,65 +1782,65 @@ void bli_sgemmsup_rd_haswell_asm_2x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -1856,7 +1856,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n label(.SRETURN) - + end_asm( : // output operands (none) @@ -1949,8 +1949,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1980,7 +1980,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1997,7 +1997,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2042,18 +2042,18 @@ void bli_sgemmsup_rd_haswell_asm_1x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2077,7 +2077,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 #if 1 @@ -2103,7 +2103,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -2150,30 +2150,30 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -2187,21 +2187,21 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2209,11 +2209,11 @@ void bli_sgemmsup_rd_haswell_asm_1x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -2227,20 +2227,20 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 - + // ymm4 ymm7 ymm10 ymm13 + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2259,7 +2259,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -2267,57 +2267,57 @@ void bli_sgemmsup_rd_haswell_asm_1x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -2333,7 +2333,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c index 1637e9766..4e6b75572 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 8; @@ -225,15 +225,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -336,19 +336,19 @@ void bli_dgemmsup_rv_haswell_asm_6x8m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -356,7 +356,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -367,14 +367,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -383,7 +383,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -402,14 +402,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -417,8 +417,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -426,7 +426,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -437,14 +437,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) 
vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -452,7 +452,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -473,14 +473,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -488,50 +488,50 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -539,23 +539,23 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -568,24 +568,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -594,60 +594,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -734,51 +734,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -843,9 +843,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -866,8 +866,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -995,8 +995,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1026,15 +1026,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1135,19 +1135,19 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1155,7 +1155,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1166,14 +1166,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1182,7 +1182,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -1201,14 +1201,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1216,8 +1216,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1225,7 +1225,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1236,14 +1236,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) 
vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1251,7 +1251,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -1272,14 +1272,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1287,27 +1287,27 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -1316,21 +1316,21 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1338,23 +1338,23 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -1367,24 +1367,24 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vmulpd(xmm0, xmm13, xmm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1393,60 +1393,60 @@ void bli_dgemmsup_rv_haswell_asm_6x6m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1521,51 +1521,51 @@ void bli_dgemmsup_rv_haswell_asm_6x6m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1622,9 +1622,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -1645,8 +1645,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -1774,8 +1774,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1805,9 +1805,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1822,7 +1822,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1907,17 +1907,17 @@ void bli_dgemmsup_rv_haswell_asm_6x4m #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1925,7 +1925,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1933,19 +1933,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -1961,18 +1961,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 2 @@ -1981,7 +1981,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1989,18 +1989,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) 
vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -2018,38 +2018,38 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2062,58 +2062,58 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2122,42 +2122,42 @@ void bli_dgemmsup_rv_haswell_asm_6x4m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2205,45 +2205,45 @@ void bli_dgemmsup_rv_haswell_asm_6x4m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. @@ -2278,15 +2278,15 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - + lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c @@ -2302,8 +2302,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4m label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -2431,8 +2431,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2462,9 +2462,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2479,7 +2479,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2558,19 +2558,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2578,7 +2578,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2586,19 +2586,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -2614,18 +2614,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 2 @@ -2634,7 +2634,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2642,18 +2642,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) 
vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -2671,43 +2671,43 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2715,58 +2715,58 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2775,42 +2775,42 @@ void bli_dgemmsup_rv_haswell_asm_6x2m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14) vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2846,40 +2846,40 @@ void bli_dgemmsup_rv_haswell_asm_6x2m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2890,7 +2890,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -2911,10 +2911,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) @@ -2936,7 +2936,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c index 5ecef06e8..2533a7825 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... 
-------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -154,14 +154,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n } #endif - dgemmsup_ker_ft ker_fps[6] = + dgemmsup_ker_ft ker_fps[6] = { NULL, bli_dgemmsup_rv_haswell_asm_1x8n, bli_dgemmsup_rv_haswell_asm_2x8n, bli_dgemmsup_rv_haswell_asm_3x8n, bli_dgemmsup_rv_haswell_asm_4x8n, - bli_dgemmsup_rv_haswell_asm_5x8n + bli_dgemmsup_rv_haswell_asm_5x8n }; dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; @@ -203,15 +203,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -313,19 +313,19 @@ void bli_dgemmsup_rv_haswell_asm_6x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -344,14 +344,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -360,7 +360,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -379,14 +379,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -394,8 +394,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -414,14 +414,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -429,7 +429,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, 
ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -450,14 +450,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -465,25 +465,25 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -494,21 +494,21 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -516,23 +516,23 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -545,24 +545,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -571,60 +571,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -711,51 +711,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -820,9 +820,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -841,8 +841,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -959,8 +959,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -990,15 +990,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1097,19 +1097,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1117,7 +1117,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1128,20 +1128,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -1160,20 +1160,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1181,7 +1181,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1192,19 +1192,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; 
vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -1225,37 +1225,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -1266,42 +1266,42 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1312,24 +1312,24 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1338,52 +1338,52 @@ void bli_dgemmsup_rv_haswell_asm_5x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1468,46 +1468,46 @@ void bli_dgemmsup_rv_haswell_asm_5x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1568,9 +1568,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -1589,8 +1589,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -1707,8 +1707,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1740,13 +1740,13 @@ void bli_dgemmsup_rv_haswell_asm_4x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1842,19 +1842,19 @@ void bli_dgemmsup_rv_haswell_asm_4x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1862,7 +1862,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1873,7 +1873,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1882,7 +1882,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1901,7 +1901,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1909,8 +1909,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1918,7 +1918,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1929,7 +1929,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1937,7 +1937,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1958,7 +1958,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, 
ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1966,25 +1966,25 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -1995,14 +1995,14 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2010,23 +2010,23 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -2035,24 +2035,24 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2061,44 +2061,44 @@ void bli_dgemmsup_rv_haswell_asm_4x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2153,21 +2153,21 @@ void bli_dgemmsup_rv_haswell_asm_4x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2175,16 +2175,16 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2225,9 +2225,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -2246,8 +2246,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -2355,8 +2355,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2388,13 +2388,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2493,19 +2493,19 @@ void bli_dgemmsup_rv_haswell_asm_3x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2513,7 +2513,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2524,13 +2524,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2549,13 +2549,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2563,7 +2563,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2574,12 +2574,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2600,30 +2600,30 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -2634,59 +2634,59 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2695,36 +2695,36 @@ void bli_dgemmsup_rv_haswell_asm_3x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2811,21 +2811,21 @@ void bli_dgemmsup_rv_haswell_asm_3x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2833,12 +2833,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2903,9 +2903,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -2924,8 +2924,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -3033,8 +3033,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3066,13 +3066,13 @@ void bli_dgemmsup_rv_haswell_asm_2x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3162,19 +3162,19 @@ void bli_dgemmsup_rv_haswell_asm_2x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -3182,7 +3182,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3195,7 +3195,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -3215,8 +3215,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3224,7 +3224,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3236,7 +3236,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -3258,25 +3258,25 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -3287,7 +3287,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; @@ -3295,45 +3295,45 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3342,28 +3342,28 @@ void bli_dgemmsup_rv_haswell_asm_2x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3406,21 +3406,21 @@ void bli_dgemmsup_rv_haswell_asm_2x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -3428,8 +3428,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3462,9 +3462,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -3483,8 +3483,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -3592,8 +3592,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3625,13 +3625,13 @@ void bli_dgemmsup_rv_haswell_asm_1x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3718,19 +3718,19 @@ void bli_dgemmsup_rv_haswell_asm_1x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -3738,7 +3738,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3748,7 +3748,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 1 @@ -3765,8 +3765,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 @@ -3774,7 +3774,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3783,7 +3783,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3802,25 +3802,25 @@ void bli_dgemmsup_rv_haswell_asm_1x8n add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -3831,48 +3831,48 @@ void bli_dgemmsup_rv_haswell_asm_1x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3881,20 +3881,20 @@ void bli_dgemmsup_rv_haswell_asm_1x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3937,26 +3937,26 @@ void bli_dgemmsup_rv_haswell_asm_1x8n jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3985,9 +3985,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -4006,8 +4006,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n label(.DRETURN) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c index 426e5157e..aacfd8d1f 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 16; @@ -249,15 +249,15 @@ void bli_sgemmsup_rv_haswell_asm_6x16m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -369,19 +369,19 @@ void bli_sgemmsup_rv_haswell_asm_6x16m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -389,7 +389,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -400,14 +400,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -416,7 +416,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -435,14 +435,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -450,8 +450,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -459,7 +459,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -470,14 +470,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) 
vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -485,7 +485,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -506,14 +506,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -521,50 +521,50 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -572,23 +572,23 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -601,26 +601,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -629,60 +629,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -828,51 +828,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -977,9 +977,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -1000,8 +1000,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -1129,8 +1129,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1160,15 +1160,15 @@ void bli_sgemmsup_rv_haswell_asm_6x12m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1275,19 +1275,19 @@ void bli_sgemmsup_rv_haswell_asm_6x12m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1295,7 +1295,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1306,14 +1306,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1322,7 +1322,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -1341,14 +1341,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1356,8 +1356,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1365,7 +1365,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1376,14 +1376,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), 
ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1391,7 +1391,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -1412,14 +1412,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1427,50 +1427,50 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1478,23 +1478,23 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -1507,26 +1507,26 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vmulps(xmm0, xmm13, xmm13) vmulps(ymm0, ymm14, ymm14) vmulps(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1535,60 +1535,60 @@ void bli_sgemmsup_rv_haswell_asm_6x12m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1711,51 +1711,51 @@ void bli_sgemmsup_rv_haswell_asm_6x12m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1844,9 +1844,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -1867,8 +1867,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -1996,8 +1996,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2027,15 +2027,15 @@ void bli_sgemmsup_rv_haswell_asm_6x8m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2131,19 +2131,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2151,7 +2151,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2159,19 +2159,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -2187,19 +2187,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2207,7 +2207,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2215,18 +2215,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 
@@ -2244,104 +2244,104 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2350,42 +2350,42 @@ void bli_sgemmsup_rv_haswell_asm_6x8m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2463,45 +2463,45 @@ void bli_sgemmsup_rv_haswell_asm_6x8m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2557,9 +2557,9 @@ void bli_sgemmsup_rv_haswell_asm_6x8m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -2580,8 +2580,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -2709,8 +2709,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2740,15 +2740,15 @@ void bli_sgemmsup_rv_haswell_asm_6x6m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2842,19 +2842,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2862,7 +2862,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2872,19 +2872,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -2902,19 +2902,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2922,7 +2922,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2932,18 +2932,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) 
vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -2963,43 +2963,43 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -3009,60 +3009,60 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3071,12 +3071,12 @@ void bli_sgemmsup_rv_haswell_asm_6x6m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -3086,8 +3086,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -3097,8 +3097,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -3108,8 +3108,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -3119,8 +3119,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) @@ -3130,8 +3130,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*4)) @@ -3141,8 +3141,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3207,57 +3207,57 @@ void bli_sgemmsup_rv_haswell_asm_6x6m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3305,7 +3305,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) @@ -3326,8 +3326,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -3455,8 +3455,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3486,15 +3486,15 @@ void bli_sgemmsup_rv_haswell_asm_6x4m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3585,19 +3585,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -3605,7 +3605,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -3613,19 +3613,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -3641,19 +3641,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3661,7 +3661,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -3669,18 +3669,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, 
xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -3698,104 +3698,104 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3804,42 +3804,42 @@ void bli_sgemmsup_rv_haswell_asm_6x4m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3893,45 +3893,45 @@ void bli_sgemmsup_rv_haswell_asm_6x4m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3972,9 +3972,9 @@ void bli_sgemmsup_rv_haswell_asm_6x4m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -3995,8 +3995,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -4124,8 +4124,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -4155,15 +4155,15 @@ void bli_sgemmsup_rv_haswell_asm_6x2m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -4252,19 +4252,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -4272,7 +4272,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -4280,19 +4280,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -4308,19 +4308,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -4328,7 +4328,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -4336,18 +4336,18 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ 
-4365,104 +4365,104 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -4471,42 +4471,42 @@ void bli_sgemmsup_rv_haswell_asm_6x2m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4541,45 +4541,45 @@ void bli_sgemmsup_rv_haswell_asm_6x2m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4606,9 +4606,9 @@ void bli_sgemmsup_rv_haswell_asm_6x2m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -4629,8 +4629,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m label(.SRETURN) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c index 7463707cc..da768ebf1 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... 
-------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -154,14 +154,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n } #endif - sgemmsup_ker_ft ker_fps[6] = + sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_haswell_asm_1x16n, bli_sgemmsup_rv_haswell_asm_2x16n, bli_sgemmsup_rv_haswell_asm_3x16n, bli_sgemmsup_rv_haswell_asm_4x16n, - bli_sgemmsup_rv_haswell_asm_5x16n + bli_sgemmsup_rv_haswell_asm_5x16n }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; @@ -203,15 +203,15 @@ void bli_sgemmsup_rv_haswell_asm_6x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -322,19 +322,19 @@ void bli_sgemmsup_rv_haswell_asm_6x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -353,14 +353,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -369,7 +369,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -388,14 +388,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -403,8 +403,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -423,14 +423,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -438,7 +438,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) 
vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -459,14 +459,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -474,25 +474,25 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -503,21 +503,21 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -525,23 +525,23 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -554,26 +554,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -582,60 +582,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -781,51 +781,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -930,9 +930,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16n //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -952,8 +952,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -1094,8 +1094,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1125,15 +1125,15 @@ void bli_sgemmsup_rv_haswell_asm_5x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1241,19 +1241,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1272,20 +1272,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -1304,20 +1304,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1336,19 +1336,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -1369,37 +1369,37 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) 
vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -1410,42 +1410,42 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -1456,26 +1456,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1484,52 +1484,52 @@ void bli_sgemmsup_rv_haswell_asm_5x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1701,46 +1701,46 @@ void bli_sgemmsup_rv_haswell_asm_5x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1862,9 +1862,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16n //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -1884,8 +1884,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -2026,8 +2026,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2057,15 +2057,15 @@ void bli_sgemmsup_rv_haswell_asm_4x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2170,19 +2170,19 @@ void bli_sgemmsup_rv_haswell_asm_4x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2201,7 +2201,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2210,7 +2210,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -2229,7 +2229,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2237,8 +2237,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2257,7 +2257,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2265,7 +2265,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -2286,7 +2286,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2294,25 +2294,25 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -2323,14 +2323,14 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2338,23 +2338,23 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -2363,26 +2363,26 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2391,44 +2391,44 @@ void bli_sgemmsup_rv_haswell_asm_4x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2516,41 +2516,41 @@ void bli_sgemmsup_rv_haswell_asm_4x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2620,9 +2620,9 @@ void bli_sgemmsup_rv_haswell_asm_4x16n //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) @@ -2642,8 +2642,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -2784,8 +2784,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2815,15 +2815,15 @@ void bli_sgemmsup_rv_haswell_asm_3x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2931,19 +2931,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2962,13 +2962,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2987,13 +2987,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3012,12 +3012,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -3038,30 +3038,30 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -3072,61 +3072,61 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3135,36 +3135,36 @@ void bli_sgemmsup_rv_haswell_asm_3x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3316,36 +3316,36 @@ void bli_sgemmsup_rv_haswell_asm_3x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3439,9 +3439,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) @@ -3461,8 +3461,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -3603,8 +3603,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3634,15 +3634,15 @@ void bli_sgemmsup_rv_haswell_asm_2x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3741,19 +3741,19 @@ void bli_sgemmsup_rv_haswell_asm_2x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -3774,7 +3774,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -3794,8 +3794,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3815,7 +3815,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -3837,25 +3837,25 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -3866,7 +3866,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; @@ -3874,47 +3874,47 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3923,28 +3923,28 @@ void bli_sgemmsup_rv_haswell_asm_2x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4012,21 +4012,21 @@ void bli_sgemmsup_rv_haswell_asm_2x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -4034,8 +4034,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4077,9 +4077,9 @@ void bli_sgemmsup_rv_haswell_asm_2x16n //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) @@ -4099,8 +4099,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -4241,8 +4241,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -4272,15 +4272,15 @@ void bli_sgemmsup_rv_haswell_asm_1x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -4376,19 +4376,19 @@ void bli_sgemmsup_rv_haswell_asm_1x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -4406,7 +4406,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -4423,8 +4423,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 0 @@ -4441,7 +4441,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -4460,25 +4460,25 @@ void bli_sgemmsup_rv_haswell_asm_1x16n add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -4489,50 +4489,50 @@ void bli_sgemmsup_rv_haswell_asm_1x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -4541,20 +4541,20 @@ void bli_sgemmsup_rv_haswell_asm_1x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4648,26 +4648,26 @@ void bli_sgemmsup_rv_haswell_asm_1x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4728,9 +4728,9 @@ void bli_sgemmsup_rv_haswell_asm_1x16n //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) @@ -4750,8 +4750,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n label(.SRETURN) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c index 69d543a99..67b3ec8bf 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -104,8 +104,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c index 457ef9f22..929f9ea47 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -586,8 +586,8 @@ void bli_dgemmsup_rd_haswell_asm_3x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -995,8 +995,8 @@ void bli_dgemmsup_rd_haswell_asm_2x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1369,8 +1369,8 @@ void bli_dgemmsup_rd_haswell_asm_1x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c index af498eb0e..397d932e4 
100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -99,9 +99,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -169,19 +169,19 @@ void bli_dgemmsup_rd_haswell_asm_6x2 prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -219,7 +219,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -253,7 +253,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -321,27 +321,27 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -377,21 +377,21 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -399,7 +399,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -429,12 +429,12 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -447,7 +447,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -480,7 +480,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // xmm14[0:1] = sum(ymm14) sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -488,103 +488,103 @@ void bli_dgemmsup_rd_haswell_asm_6x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -628,8 +628,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -655,9 +655,9 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -675,7 +675,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -716,19 +716,19 @@ void bli_dgemmsup_rd_haswell_asm_3x2 prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -754,7 +754,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -776,7 +776,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -820,27 +820,27 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -864,21 +864,21 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -886,7 +886,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -904,12 +904,12 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -919,7 +919,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -937,7 +937,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // xmm8[0:1] = sum(ymm8) sum(ymm9) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -945,79 +945,79 @@ void bli_dgemmsup_rd_haswell_asm_3x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -1061,8 +1061,8 @@ void bli_dgemmsup_rd_haswell_asm_2x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1088,9 +1088,9 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1108,7 +1108,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1146,19 +1146,19 @@ void bli_dgemmsup_rd_haswell_asm_2x2 prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1180,7 +1180,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1198,7 +1198,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1234,27 +1234,27 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1274,21 +1274,21 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1296,7 +1296,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1310,12 +1310,12 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1324,7 +1324,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // ymm4 ymm5 // ymm6 ymm7 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -1337,7 +1337,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // xmm6[0:1] = sum(ymm6) sum(ymm7) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1345,71 +1345,71 @@ void bli_dgemmsup_rd_haswell_asm_2x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -1453,8 +1453,8 @@ void bli_dgemmsup_rd_haswell_asm_1x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1480,9 +1480,9 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1500,7 +1500,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1535,19 +1535,19 @@ void bli_dgemmsup_rd_haswell_asm_1x2 //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1565,7 +1565,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1579,7 +1579,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1607,27 +1607,27 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1643,21 +1643,21 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1665,7 +1665,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1675,12 +1675,12 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1688,7 +1688,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 label(.DPOSTACCUM) // ymm4 ymm5 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -1696,7 +1696,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // xmm4[0:1] = sum(ymm4) sum(ymm5) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1704,63 +1704,63 @@ void bli_dgemmsup_rd_haswell_asm_1x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c index 516bfced5..75e84650c 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -605,8 +605,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1055,8 +1055,8 @@ void bli_dgemmsup_rd_haswell_asm_1x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c index 571444bed..b2e3d83af 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t 
n_left = n0 % 8; @@ -163,7 +163,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -181,7 +181,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -256,18 +256,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -302,7 +302,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -333,7 +333,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -395,27 +395,27 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
- - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -426,7 +426,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -448,21 +448,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -470,12 +470,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -497,23 +497,23 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -553,7 +553,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -561,73 +561,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -651,7 +651,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 label(.DRETURN) - + end_asm( : // output operands (none) @@ -735,8 +735,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -761,7 +761,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -779,7 +779,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -833,18 +833,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8 prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -874,7 +874,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -900,7 +900,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -952,27 +952,27 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -982,7 +982,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1000,21 +1000,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1022,11 +1022,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1044,22 +1044,22 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1090,70 +1090,70 @@ void bli_dgemmsup_rd_haswell_asm_2x8 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1165,7 +1165,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 label(.DRETURN) - + end_asm( : // output operands (none) @@ -1209,8 +1209,8 @@ void bli_dgemmsup_rd_haswell_asm_1x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1235,7 +1235,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1253,7 +1253,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c @@ -1302,18 +1302,18 @@ void bli_dgemmsup_rd_haswell_asm_1x8 prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1338,7 +1338,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -1359,7 +1359,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1401,27 +1401,27 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1430,7 +1430,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -1444,21 +1444,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1466,10 +1466,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -1483,12 +1483,12 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1496,9 +1496,9 @@ void bli_dgemmsup_rd_haswell_asm_1x8 label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 - + + // ymm4 ymm7 ymm10 ymm13 + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1513,7 +1513,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1521,57 +1521,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1583,7 +1583,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c index eb1118196..5843d5e40 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -168,31 +168,31 @@ void bli_dgemmsup_rv_haswell_asm_6x2 prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -200,19 +200,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -226,25 +226,25 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -252,18 +252,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -278,43 +278,43 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, 
xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -322,57 +322,57 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -381,42 +381,42 @@ void bli_dgemmsup_rv_haswell_asm_6x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14) vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -452,40 +452,40 @@ void bli_dgemmsup_rv_haswell_asm_6x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) @@ -517,13 +517,13 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -566,8 +566,8 @@ void bli_dgemmsup_rv_haswell_asm_5x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -588,9 +588,9 @@ void bli_dgemmsup_rv_haswell_asm_5x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -605,7 +605,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -646,21 +646,21 @@ void bli_dgemmsup_rv_haswell_asm_5x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 5*8)) #endif @@ -672,17 +672,17 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 1 #if 0 @@ -696,23 +696,23 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -720,16 +720,16 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 3 @@ -744,41 +744,41 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - - - + + + 
dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -786,54 +786,54 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // r13 = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -842,37 +842,37 @@ void bli_dgemmsup_rv_haswell_asm_5x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) vmovupd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -907,37 +907,37 @@ void bli_dgemmsup_rv_haswell_asm_5x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) @@ -947,7 +947,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -967,13 +967,13 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vmovhpd(xmm0, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1016,8 +1016,8 @@ void bli_dgemmsup_rv_haswell_asm_4x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1038,9 +1038,9 @@ void bli_dgemmsup_rv_haswell_asm_4x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1055,7 +1055,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1089,31 +1089,31 @@ void bli_dgemmsup_rv_haswell_asm_4x2 prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1121,14 +1121,14 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1142,20 +1142,20 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1163,13 +1163,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) 
vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 3 @@ -1184,89 +1184,89 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1275,32 +1275,32 @@ void bli_dgemmsup_rv_haswell_asm_4x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1326,32 +1326,32 @@ void bli_dgemmsup_rv_haswell_asm_4x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1362,7 +1362,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -1375,13 +1375,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1424,8 +1424,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1446,9 +1446,9 @@ void bli_dgemmsup_rv_haswell_asm_3x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1463,7 +1463,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1496,31 +1496,31 @@ void bli_dgemmsup_rv_haswell_asm_3x2 prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1528,12 +1528,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1547,18 +1547,18 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1566,11 +1566,11 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 3 @@ -1585,36 +1585,36 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1622,61 +1622,61 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1690,10 +1690,10 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1722,26 +1722,26 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. 
- - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1752,8 +1752,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vmovupd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1781,12 +1781,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1829,8 +1829,8 @@ void bli_dgemmsup_rv_haswell_asm_2x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1851,9 +1851,9 @@ void bli_dgemmsup_rv_haswell_asm_2x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1868,7 +1868,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1900,41 +1900,41 @@ void bli_dgemmsup_rv_haswell_asm_2x2 prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 1 #if 0 @@ -1943,29 +1943,29 @@ void bli_dgemmsup_rv_haswell_asm_2x2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 3 @@ -1975,82 +1975,82 @@ void bli_dgemmsup_rv_haswell_asm_2x2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2059,22 +2059,22 @@ void bli_dgemmsup_rv_haswell_asm_2x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2094,34 +2094,34 @@ void bli_dgemmsup_rv_haswell_asm_2x2 jmp(.DDONE) // jump to end. - - - + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) - + vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -2130,13 +2130,13 @@ void bli_dgemmsup_rv_haswell_asm_2x2 vmovupd(xmm1, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2179,8 +2179,8 @@ void bli_dgemmsup_rv_haswell_asm_1x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2201,9 +2201,9 @@ void bli_dgemmsup_rv_haswell_asm_1x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2218,7 +2218,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2249,31 +2249,31 @@ void bli_dgemmsup_rv_haswell_asm_1x2 prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2281,7 +2281,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2294,21 +2294,21 @@ void bli_dgemmsup_rv_haswell_asm_1x2 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 3 @@ -2322,92 +2322,92 @@ void bli_dgemmsup_rv_haswell_asm_1x2 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2422,48 +2422,48 @@ void bli_dgemmsup_rv_haswell_asm_1x2 vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) - + //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
- + label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vmovlpd(xmm4, mem(rcx )) vmovhpd(xmm4, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c index bdcf833e3..6fb5eaf8a 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -170,31 +170,31 @@ void bli_dgemmsup_rv_haswell_asm_6x4 prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -202,19 +202,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -228,25 +228,25 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -254,18 +254,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -280,43 +280,43 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, 
ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -324,57 +324,57 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -383,42 +383,42 @@ void bli_dgemmsup_rv_haswell_asm_6x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -466,45 +466,45 @@ void bli_dgemmsup_rv_haswell_asm_6x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. @@ -539,13 +539,13 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -588,8 +588,8 @@ void bli_dgemmsup_rv_haswell_asm_5x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -610,9 +610,9 @@ void bli_dgemmsup_rv_haswell_asm_5x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -627,7 +627,7 @@ void bli_dgemmsup_rv_haswell_asm_5x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -670,19 +670,19 @@ void bli_dgemmsup_rv_haswell_asm_5x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -696,17 +696,17 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 1 #if 0 @@ -720,16 +720,16 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 2 @@ -744,16 +744,16 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 3 @@ -768,41 +768,41 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -810,54 +810,54 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -866,37 +866,37 @@ void bli_dgemmsup_rv_haswell_asm_5x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -943,41 +943,41 @@ void bli_dgemmsup_rv_haswell_asm_5x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. @@ -1010,13 +1010,13 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vmovhpd(xmm1, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1059,8 +1059,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1081,9 +1081,9 @@ void bli_dgemmsup_rv_haswell_asm_4x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1098,7 +1098,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1134,8 +1134,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4 prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; @@ -1143,22 +1143,22 @@ void bli_dgemmsup_rv_haswell_asm_4x4 - + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1166,14 +1166,14 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1182,39 +1182,39 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 
0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 3 @@ -1224,128 +1224,128 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1377,33 +1377,33 @@ void bli_dgemmsup_rv_haswell_asm_4x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1413,7 +1413,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) @@ -1431,12 +1431,12 @@ void bli_dgemmsup_rv_haswell_asm_4x4 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1479,8 +1479,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1501,9 +1501,9 @@ void bli_dgemmsup_rv_haswell_asm_3x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1518,7 +1518,7 @@ void bli_dgemmsup_rv_haswell_asm_3x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1553,31 +1553,31 @@ void bli_dgemmsup_rv_haswell_asm_3x4 prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1585,12 +1585,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1604,18 +1604,18 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1623,11 +1623,11 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 3 @@ -1642,36 +1642,36 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1679,61 +1679,61 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1747,10 +1747,10 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1791,26 +1791,26 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1821,8 +1821,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vmovupd(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1858,12 +1858,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1906,8 +1906,8 @@ void bli_dgemmsup_rv_haswell_asm_2x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1928,9 +1928,9 @@ void bli_dgemmsup_rv_haswell_asm_2x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1945,7 +1945,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1979,31 +1979,31 @@ void bli_dgemmsup_rv_haswell_asm_2x4 prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2012,8 +2012,8 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2028,10 +2028,10 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2044,7 +2044,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 3 @@ -2060,32 +2060,32 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2094,42 +2094,42 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2138,22 +2138,22 @@ void bli_dgemmsup_rv_haswell_asm_2x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2179,24 +2179,24 @@ void bli_dgemmsup_rv_haswell_asm_2x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2207,7 +2207,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) @@ -2220,13 +2220,13 @@ void bli_dgemmsup_rv_haswell_asm_2x4 vmovupd(xmm4, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2269,8 +2269,8 @@ void bli_dgemmsup_rv_haswell_asm_1x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2291,9 +2291,9 @@ void bli_dgemmsup_rv_haswell_asm_1x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2308,7 +2308,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2341,27 +2341,27 @@ void bli_dgemmsup_rv_haswell_asm_1x4 prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2372,8 +2372,8 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2386,21 +2386,21 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -2414,27 +2414,27 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2446,41 +2446,41 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2488,17 +2488,17 @@ void bli_dgemmsup_rv_haswell_asm_1x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2520,15 +2520,15 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) - + //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) @@ -2536,10 +2536,10 @@ void bli_dgemmsup_rv_haswell_asm_1x4 jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2549,7 +2549,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vmovupd(ymm4, ymm0) @@ -2560,14 +2560,14 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - - + + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c index 9da1e7b83..2b7222a34 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... 
-------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,15 +115,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -180,18 +180,18 @@ void bli_dgemmsup_rv_haswell_asm_6x6 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -208,14 +208,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -224,7 +224,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -241,14 +241,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -256,8 +256,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 @@ -274,14 +274,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -289,7 +289,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, 
ymm15) - + // ---------------------------------- iteration 3 @@ -307,14 +307,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -322,50 +322,50 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -373,22 +373,22 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -401,24 +401,24 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vmulpd(xmm0, xmm13, xmm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -427,60 +427,60 @@ void bli_dgemmsup_rv_haswell_asm_6x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -555,51 +555,51 @@ void bli_dgemmsup_rv_haswell_asm_6x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -656,12 +656,12 @@ void bli_dgemmsup_rv_haswell_asm_6x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -704,8 +704,8 @@ void bli_dgemmsup_rv_haswell_asm_5x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -726,15 +726,15 @@ void bli_dgemmsup_rv_haswell_asm_5x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -790,18 +790,18 @@ void bli_dgemmsup_rv_haswell_asm_5x6 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -818,20 +818,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -848,20 +848,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 @@ -878,19 +878,19 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -908,82 +908,82 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) 
vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -994,24 +994,24 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vmulpd(xmm0, xmm11, xmm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(xmm0, xmm13, xmm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1020,52 +1020,52 @@ void bli_dgemmsup_rv_haswell_asm_5x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1138,46 +1138,46 @@ void bli_dgemmsup_rv_haswell_asm_5x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1231,12 +1231,12 @@ void bli_dgemmsup_rv_haswell_asm_5x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1279,8 +1279,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1301,15 +1301,15 @@ void bli_dgemmsup_rv_haswell_asm_4x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1365,17 +1365,17 @@ void bli_dgemmsup_rv_haswell_asm_4x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1392,7 +1392,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1401,7 +1401,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1418,7 +1418,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1426,8 +1426,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 1 @@ -1444,7 +1444,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1452,7 +1452,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1470,7 +1470,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1478,43 +1478,43 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1522,22 +1522,22 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -1546,24 +1546,24 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1572,44 +1572,44 @@ void bli_dgemmsup_rv_haswell_asm_4x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1658,41 +1658,41 @@ void bli_dgemmsup_rv_haswell_asm_4x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1729,9 +1729,9 @@ void bli_dgemmsup_rv_haswell_asm_4x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -1777,8 +1777,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1799,9 +1799,9 @@ void bli_dgemmsup_rv_haswell_asm_3x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. 
- + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1816,7 +1816,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1854,31 +1854,31 @@ void bli_dgemmsup_rv_haswell_asm_3x6 prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1889,13 +1889,13 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1912,15 +1912,15 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -1935,12 +1935,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + 
vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -1958,37 +1958,37 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1999,65 +1999,65 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -2080,10 +2080,10 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -2124,7 +2124,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-5 @@ -2155,26 +2155,26 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2186,8 +2186,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2244,12 +2244,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2292,8 +2292,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2314,15 +2314,15 @@ void bli_dgemmsup_rv_haswell_asm_2x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2376,17 +2376,17 @@ void bli_dgemmsup_rv_haswell_asm_2x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2405,7 +2405,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -2423,8 +2423,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 1 @@ -2442,7 +2442,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2461,36 +2461,36 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2498,44 +2498,44 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2544,28 +2544,28 @@ void bli_dgemmsup_rv_haswell_asm_2x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2614,31 +2614,31 @@ void bli_dgemmsup_rv_haswell_asm_2x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -2675,9 +2675,9 @@ void bli_dgemmsup_rv_haswell_asm_2x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -2723,8 +2723,8 @@ void bli_dgemmsup_rv_haswell_asm_1x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2745,15 +2745,15 @@ void bli_dgemmsup_rv_haswell_asm_1x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2806,17 +2806,17 @@ void bli_dgemmsup_rv_haswell_asm_1x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2832,7 +2832,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -2847,8 +2847,8 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 @@ -2863,7 +2863,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -2879,76 +2879,76 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2957,20 +2957,20 @@ void bli_dgemmsup_rv_haswell_asm_1x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3007,26 +3007,26 @@ void bli_dgemmsup_rv_haswell_asm_1x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -3052,9 +3052,9 @@ void bli_dgemmsup_rv_haswell_asm_1x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c index a6c8f0e43..b3a7c17ca 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -108,8 +108,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 0 @@ -178,7 +178,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 // Advance C and A pointers by the mrs and nrs we just // used, and decrement m_left. cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; - } + } } // Advance C and B pointers by the mrs and nrs we just used, and @@ -208,9 +208,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -225,7 +225,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -275,25 +275,25 @@ void bli_dgemmsup_rv_haswell_asm_6x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -304,14 +304,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -320,7 +320,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -337,14 +337,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -352,14 +352,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -370,14 +370,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, 
ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -385,7 +385,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -403,14 +403,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -418,50 +418,50 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -469,22 +469,22 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -497,24 +497,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -523,60 +523,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -663,51 +663,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -772,12 +772,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -820,8 +820,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -842,15 +842,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -909,18 +909,18 @@ void bli_dgemmsup_rv_haswell_asm_5x8 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -937,20 +937,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -967,26 +967,26 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -997,19 +997,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -1027,37 +1027,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - 
+ vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 @@ -1068,41 +1068,41 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1113,24 +1113,24 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1139,52 +1139,52 @@ void bli_dgemmsup_rv_haswell_asm_5x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1269,46 +1269,46 @@ void bli_dgemmsup_rv_haswell_asm_5x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1367,9 +1367,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -1415,8 +1415,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1437,9 +1437,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1454,7 +1454,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1496,31 +1496,31 @@ void bli_dgemmsup_rv_haswell_asm_4x8 prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1531,7 +1531,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1539,8 +1539,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1557,7 +1557,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1565,10 +1565,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 2 - + #if 1 
prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -1583,7 +1583,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1591,7 +1591,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1609,7 +1609,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1617,27 +1617,27 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -1653,7 +1653,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1661,22 +1661,22 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1685,38 +1685,38 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -1747,10 +1747,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1805,19 +1805,19 @@ void bli_dgemmsup_rv_haswell_asm_4x8 jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -1833,8 +1833,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1875,12 +1875,12 @@ void bli_dgemmsup_rv_haswell_asm_4x8 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1923,8 +1923,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1945,9 +1945,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1962,7 +1962,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2003,27 +2003,27 @@ void bli_dgemmsup_rv_haswell_asm_3x8 prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2038,13 +2038,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2061,15 +2061,15 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2084,12 +2084,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2107,32 +2107,32 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2148,65 +2148,65 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -2229,10 +2229,10 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
- + label(.DCOLSTORED) @@ -2273,7 +2273,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 @@ -2312,26 +2312,26 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2343,8 +2343,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2409,12 +2409,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2457,8 +2457,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2479,9 +2479,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2496,7 +2496,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2536,27 +2536,27 @@ void bli_dgemmsup_rv_haswell_asm_2x8 prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2572,8 +2572,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2591,10 +2591,10 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2610,7 +2610,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2629,27 +2629,27 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2666,44 +2666,44 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2712,12 +2712,12 @@ void bli_dgemmsup_rv_haswell_asm_2x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -2732,8 +2732,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2778,19 +2778,19 @@ void bli_dgemmsup_rv_haswell_asm_2x8 jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2798,8 +2798,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2832,12 +2832,12 @@ void bli_dgemmsup_rv_haswell_asm_2x8 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2880,8 +2880,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2902,9 +2902,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2919,7 +2919,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2958,27 +2958,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2991,8 +2991,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 1 #if 0 @@ -3002,15 +3002,15 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -3023,7 +3023,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3039,27 +3039,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -3068,18 +3068,18 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) @@ -3088,27 +3088,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3116,20 +3116,20 @@ void bli_dgemmsup_rv_haswell_asm_1x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3150,7 +3150,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) - + lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 @@ -3173,26 +3173,26 @@ void bli_dgemmsup_rv_haswell_asm_1x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -3220,14 +3220,14 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c index dad5458b9..98b557fae 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -103,8 +103,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ @@ -175,8 +175,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ for ( dim_t i = 0; i < m; ++i ) \ diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c index 1eb8d926c..c17b0b275 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -164,18 +164,18 @@ void bli_sgemmsup_rd_haswell_asm_6x1 prefetch(0, mem(r10, rdi, 2, 0*4)) // prefetch c + 5*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -206,7 +206,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -233,7 +233,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -287,27 +287,27 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -336,21 +336,21 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -358,7 +358,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -381,12 +381,12 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -399,7 +399,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // ymm10 // ymm12 // ymm14 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -443,8 +443,8 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // xmm12[0] = sum(ymm12) // xmm14[0] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -452,109 +452,109 @@ void bli_sgemmsup_rd_haswell_asm_6x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovss(xmm8, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovss(xmm10, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovss(xmm12, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovss(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(xmm8, mem(rcx)) add(rdi, rcx) - + vmovss(xmm10, mem(rcx)) add(rdi, rcx) - + vmovss(xmm12, mem(rcx)) add(rdi, rcx) - + vmovss(xmm14, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -598,8 +598,8 @@ void bli_sgemmsup_rd_haswell_asm_3x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -627,7 +627,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -645,7 +645,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -684,18 +684,18 @@ void bli_sgemmsup_rd_haswell_asm_3x1 prefetch(0, mem(rcx, rdi, 2, 0*4)) // prefetch c + 2*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. - - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -717,7 +717,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -735,7 +735,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -771,27 +771,27 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -811,21 +811,21 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. 
- - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -833,7 +833,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -847,12 +847,12 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -862,7 +862,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // ymm4 // ymm6 // ymm8 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -885,8 +885,8 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // xmm6[0] = sum(ymm6) // xmm8[0] = sum(ymm8) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -894,82 +894,82 @@ void bli_sgemmsup_rd_haswell_asm_3x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovss(xmm8, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(xmm8, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1013,8 +1013,8 @@ void bli_sgemmsup_rd_haswell_asm_2x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1042,7 +1042,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1060,7 +1060,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1097,18 +1097,18 @@ void bli_sgemmsup_rd_haswell_asm_2x1 prefetch(0, mem(rcx, rdi, 1, 0*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1127,7 +1127,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1142,7 +1142,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1172,27 +1172,27 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1209,21 +1209,21 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1231,7 +1231,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1242,12 +1242,12 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1256,7 +1256,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // ymm4 // ymm6 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1272,8 +1272,8 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // xmm4[0] = sum(ymm4) // xmm6[0] = sum(ymm6) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1281,73 +1281,73 @@ void bli_sgemmsup_rd_haswell_asm_2x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovss(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(xmm6, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1391,8 +1391,8 @@ void bli_sgemmsup_rd_haswell_asm_1x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1420,7 +1420,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1438,7 +1438,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1473,18 +1473,18 @@ void bli_sgemmsup_rd_haswell_asm_1x1 prefetch(0, mem(rcx, 0*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1500,7 +1500,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1512,7 +1512,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1536,27 +1536,27 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1570,21 +1570,21 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1592,7 +1592,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1600,12 +1600,12 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1614,7 +1614,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // ymm4 // ymm6 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1624,8 +1624,8 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // xmm4[0] = sum(ymm4) // xmm6[0] = sum(ymm6) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1633,65 +1633,65 @@ void bli_sgemmsup_rd_haswell_asm_1x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c index 1d3d88309..5fb91e634 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -194,18 +194,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -271,7 +271,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -333,27 +333,27 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -364,7 +364,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -386,21 +386,21 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -408,12 +408,12 @@ void bli_sgemmsup_rd_haswell_asm_6x12 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -435,22 +435,22 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -466,7 +466,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -482,7 +482,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -500,8 +500,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -509,73 +509,73 @@ void bli_sgemmsup_rd_haswell_asm_6x12 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) 
// load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -599,7 +599,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 label(.SRETURN) - + end_asm( : // output operands (none) @@ -644,8 +644,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -670,7 +670,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -742,18 +742,18 @@ void bli_sgemmsup_rd_haswell_asm_2x12 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -782,7 +782,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -808,7 +808,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -859,27 +859,27 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -888,7 +888,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -906,21 +906,21 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -928,11 +928,11 @@ void bli_sgemmsup_rd_haswell_asm_2x12 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -950,21 +950,21 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -980,7 +980,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -998,8 +998,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1007,65 +1007,65 @@ void bli_sgemmsup_rd_haswell_asm_2x12 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1077,7 +1077,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 label(.SRETURN) - + end_asm( : // output operands (none) @@ -1121,8 +1121,8 @@ void bli_sgemmsup_rd_haswell_asm_1x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1147,7 +1147,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1165,7 +1165,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1214,18 +1214,18 @@ void bli_sgemmsup_rd_haswell_asm_1x12 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1248,7 +1248,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1269,7 +1269,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif @@ -1309,34 +1309,34 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1350,21 +1350,21 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1372,10 +1372,10 @@ void bli_sgemmsup_rd_haswell_asm_1x12 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1389,21 +1389,21 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1421,8 +1421,8 @@ void bli_sgemmsup_rd_haswell_asm_1x12 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1430,57 +1430,57 @@ void bli_sgemmsup_rd_haswell_asm_1x12 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1492,7 +1492,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c index bbb75a6fc..1398c3da7 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 16; @@ -176,7 +176,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -194,7 +194,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -269,18 +269,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -315,7 +315,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -346,7 +346,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -408,27 +408,27 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -439,7 +439,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -461,21 +461,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -483,12 +483,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -510,22 +510,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -541,7 +541,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -557,7 +557,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -575,8 +575,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -584,73 +584,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) 
// load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -674,7 +674,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 label(.SRETURN) - + end_asm( : // output operands (none) @@ -758,8 +758,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -784,7 +784,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -802,7 +802,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -856,18 +856,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -897,7 +897,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -923,7 +923,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -975,27 +975,27 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1005,7 +1005,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1023,21 +1023,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1045,11 +1045,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1067,21 +1067,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1097,7 +1097,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1115,8 +1115,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1124,65 +1124,65 @@ void bli_sgemmsup_rd_haswell_asm_2x16 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1194,7 +1194,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 label(.SRETURN) - + end_asm( : // output operands (none) @@ -1238,8 +1238,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1264,7 +1264,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1282,7 +1282,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c @@ -1331,18 +1331,18 @@ void bli_sgemmsup_rd_haswell_asm_1x16 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1367,7 +1367,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1388,7 +1388,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1430,27 +1430,27 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1459,7 +1459,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1473,21 +1473,21 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1495,10 +1495,10 @@ void bli_sgemmsup_rd_haswell_asm_1x16 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1512,20 +1512,20 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 - + // ymm4 ymm7 ymm10 ymm13 + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1543,8 +1543,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1552,57 +1552,57 @@ void bli_sgemmsup_rd_haswell_asm_1x16 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1614,7 +1614,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c index 1e3240350..75c687267 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -170,18 +170,18 @@ void bli_sgemmsup_rd_haswell_asm_6x2 prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -219,7 +219,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -253,7 +253,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -321,27 +321,27 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -377,21 +377,21 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -399,7 +399,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -429,12 +429,12 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -447,7 +447,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -491,8 +491,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // xmm12[0:1] = sum(ymm12) sum(ymm13) // xmm14[0:1] = sum(ymm14) sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -500,109 +500,109 @@ void bli_sgemmsup_rd_haswell_asm_6x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -646,8 +646,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -675,7 +675,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -693,7 +693,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -735,18 +735,18 @@ void bli_sgemmsup_rd_haswell_asm_3x2 prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. - - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -772,7 +772,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -794,7 +794,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -837,27 +837,27 @@ void bli_sgemmsup_rd_haswell_asm_3x2 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -881,21 +881,21 @@ void bli_sgemmsup_rd_haswell_asm_3x2 vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. 
- - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -903,7 +903,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -921,12 +921,12 @@ void bli_sgemmsup_rd_haswell_asm_3x2 vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -936,7 +936,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -959,8 +959,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -968,83 +968,83 @@ void bli_sgemmsup_rd_haswell_asm_3x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1088,8 +1088,8 @@ void bli_sgemmsup_rd_haswell_asm_2x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1117,7 +1117,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1135,7 +1135,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1174,18 +1174,18 @@ void bli_sgemmsup_rd_haswell_asm_2x2 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1207,7 +1207,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1225,7 +1225,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1260,27 +1260,27 @@ void bli_sgemmsup_rd_haswell_asm_2x2 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1300,21 +1300,21 @@ void bli_sgemmsup_rd_haswell_asm_2x2 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1322,7 +1322,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1336,12 +1336,12 @@ void bli_sgemmsup_rd_haswell_asm_2x2 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1350,7 +1350,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // ymm4 ymm5 // ymm6 ymm7 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1366,8 +1366,8 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1375,73 +1375,73 @@ void bli_sgemmsup_rd_haswell_asm_2x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1485,8 +1485,8 @@ void bli_sgemmsup_rd_haswell_asm_1x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1514,7 +1514,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1532,7 +1532,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1568,18 +1568,18 @@ void bli_sgemmsup_rd_haswell_asm_1x2 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1597,7 +1597,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1611,7 +1611,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1638,27 +1638,27 @@ void bli_sgemmsup_rd_haswell_asm_1x2 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1674,21 +1674,21 @@ void bli_sgemmsup_rd_haswell_asm_1x2 vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1696,7 +1696,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1706,12 +1706,12 @@ void bli_sgemmsup_rd_haswell_asm_1x2 vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1719,7 +1719,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 label(.SPOSTACCUM) // ymm4 ymm5 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1728,8 +1728,8 @@ void bli_sgemmsup_rd_haswell_asm_1x2 // xmm4[0:1] = sum(ymm4) sum(ymm5) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1737,64 +1737,64 @@ void bli_sgemmsup_rd_haswell_asm_1x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c index 9d4e9d51d..80be4e932 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -173,18 +173,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -219,7 +219,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -250,7 +250,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -312,27 +312,27 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -343,7 +343,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -365,21 +365,21 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -387,12 +387,12 @@ void bli_sgemmsup_rd_haswell_asm_6x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -414,22 +414,22 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -445,7 +445,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -461,7 +461,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -479,8 +479,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -488,73 +488,73 @@ void bli_sgemmsup_rd_haswell_asm_6x4 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load 
cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -571,7 +571,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 label(.SRETURN) - + end_asm( : // output operands (none) @@ -616,8 +616,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -642,7 +642,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -660,7 +660,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -701,18 +701,18 @@ void bli_sgemmsup_rd_haswell_asm_2x4 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -741,7 +741,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -767,7 +767,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -818,27 +818,27 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -847,7 +847,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -865,21 +865,21 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -887,11 +887,11 @@ void bli_sgemmsup_rd_haswell_asm_2x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -909,21 +909,21 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -939,7 +939,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -957,8 +957,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -966,70 +966,70 @@ void bli_sgemmsup_rd_haswell_asm_2x4 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1073,8 +1073,8 @@ void bli_sgemmsup_rd_haswell_asm_1x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1099,7 +1099,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1117,7 +1117,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1153,18 +1153,18 @@ void bli_sgemmsup_rd_haswell_asm_1x4 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1187,7 +1187,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1208,7 +1208,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif @@ -1248,34 +1248,34 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1289,21 +1289,21 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1311,10 +1311,10 @@ void bli_sgemmsup_rd_haswell_asm_1x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1328,20 +1328,20 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 - + // ymm4 ymm7 ymm10 ymm13 + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1359,8 +1359,8 @@ void bli_sgemmsup_rd_haswell_asm_1x4 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1368,62 +1368,62 @@ void bli_sgemmsup_rd_haswell_asm_1x4 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c index 788912ecf..3a82e9b3e 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -194,18 +194,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -271,7 +271,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -333,27 +333,27 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -364,7 +364,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -386,21 +386,21 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -408,12 +408,12 @@ void bli_sgemmsup_rd_haswell_asm_6x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -435,22 +435,22 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -466,7 +466,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -482,7 +482,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -500,8 +500,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -509,73 +509,73 @@ void bli_sgemmsup_rd_haswell_asm_6x8 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load 
cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -599,7 +599,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 label(.SRETURN) - + end_asm( : // output operands (none) @@ -644,8 +644,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -670,7 +670,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -742,18 +742,18 @@ void bli_sgemmsup_rd_haswell_asm_2x8 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -782,7 +782,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -808,7 +808,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -859,27 +859,27 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -888,7 +888,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -906,21 +906,21 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -928,11 +928,11 @@ void bli_sgemmsup_rd_haswell_asm_2x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -950,21 +950,21 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -980,7 +980,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -998,8 +998,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1007,65 +1007,65 @@ void bli_sgemmsup_rd_haswell_asm_2x8 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1077,7 +1077,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 label(.SRETURN) - + end_asm( : // output operands (none) @@ -1121,8 +1121,8 @@ void bli_sgemmsup_rd_haswell_asm_1x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1147,7 +1147,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1165,7 +1165,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1214,18 +1214,18 @@ void bli_sgemmsup_rd_haswell_asm_1x8 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1248,7 +1248,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1269,7 +1269,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif @@ -1309,34 +1309,34 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1350,21 +1350,21 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1372,10 +1372,10 @@ void bli_sgemmsup_rd_haswell_asm_1x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1389,21 +1389,21 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1421,8 +1421,8 @@ void bli_sgemmsup_rd_haswell_asm_1x8 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1430,57 +1430,57 @@ void bli_sgemmsup_rd_haswell_asm_1x8 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1492,7 +1492,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c index 1bea78ee7..65d8664da 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -186,25 +186,25 @@ void bli_sgemmsup_rv_haswell_asm_6x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -215,14 +215,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -231,7 +231,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -248,14 +248,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -263,14 +263,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -281,14 +281,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) 
vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -296,7 +296,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -314,14 +314,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -329,50 +329,50 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -380,22 +380,22 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -408,26 +408,26 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vmulps(xmm0, xmm13, xmm13) vmulps(ymm0, ymm14, ymm14) vmulps(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -436,60 +436,60 @@ void bli_sgemmsup_rv_haswell_asm_6x12 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -611,51 +611,51 @@ void bli_sgemmsup_rv_haswell_asm_6x12 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -743,12 +743,12 @@ void bli_sgemmsup_rv_haswell_asm_6x12 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -791,8 +791,8 @@ void bli_sgemmsup_rv_haswell_asm_5x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -813,9 +813,9 @@ void bli_sgemmsup_rv_haswell_asm_5x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -830,7 +830,7 @@ void bli_sgemmsup_rv_haswell_asm_5x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -883,25 +883,25 @@ void bli_sgemmsup_rv_haswell_asm_5x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -912,20 +912,20 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -942,26 +942,26 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -972,19 +972,19 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) 
- + // ---------------------------------- iteration 3 @@ -1002,82 +1002,82 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -1088,26 +1088,26 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vmulps(xmm0, xmm11, xmm11) vmulps(ymm0, ymm12, ymm12) vmulps(xmm0, xmm13, xmm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1116,52 +1116,52 @@ void bli_sgemmsup_rv_haswell_asm_5x12 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1302,46 +1302,46 @@ void bli_sgemmsup_rv_haswell_asm_5x12 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1442,12 +1442,12 @@ void bli_sgemmsup_rv_haswell_asm_5x12 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1490,8 +1490,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1512,9 +1512,9 @@ void bli_sgemmsup_rv_haswell_asm_4x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1529,7 +1529,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1581,25 +1581,25 @@ void bli_sgemmsup_rv_haswell_asm_4x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1610,7 +1610,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1619,7 +1619,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1636,7 +1636,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1644,14 +1644,14 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1662,7 +1662,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1670,7 +1670,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1688,7 +1688,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1696,32 
+1696,32 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1732,7 +1732,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1740,22 +1740,22 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -1764,40 +1764,40 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vmulps(xmm0, xmm9, xmm9) vmulps(ymm0, ymm10, ymm10) vmulps(xmm0, xmm11, xmm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -1828,10 +1828,10 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -1907,19 +1907,19 @@ void bli_sgemmsup_rv_haswell_asm_4x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -1938,8 +1938,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1999,12 +1999,12 @@ void bli_sgemmsup_rv_haswell_asm_4x12 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2047,8 +2047,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2069,9 +2069,9 @@ void bli_sgemmsup_rv_haswell_asm_3x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2086,7 +2086,7 @@ void bli_sgemmsup_rv_haswell_asm_3x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2137,25 +2137,25 @@ void bli_sgemmsup_rv_haswell_asm_3x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2166,13 +2166,13 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2189,19 +2189,19 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2212,12 +2212,12 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2235,37 +2235,37 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2276,67 +2276,67 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) vmulps(ymm0, ymm8, ymm8) vmulps(xmm0, xmm9, xmm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2359,10 +2359,10 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
- + label(.SCOLSTORED) @@ -2483,19 +2483,19 @@ void bli_sgemmsup_rv_haswell_asm_3x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2509,8 +2509,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2587,12 +2587,12 @@ void bli_sgemmsup_rv_haswell_asm_3x12 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2635,8 +2635,8 @@ void bli_sgemmsup_rv_haswell_asm_2x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2657,9 +2657,9 @@ void bli_sgemmsup_rv_haswell_asm_2x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2674,7 +2674,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2724,25 +2724,25 @@ void bli_sgemmsup_rv_haswell_asm_2x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2755,7 +2755,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -2773,14 +2773,14 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2792,7 +2792,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2811,32 +2811,32 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2848,60 +2848,60 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2916,10 +2916,10 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2975,19 +2975,19 @@ void bli_sgemmsup_rv_haswell_asm_2x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2996,8 +2996,8 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3033,12 +3033,12 @@ void bli_sgemmsup_rv_haswell_asm_2x12 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -3081,8 +3081,8 @@ void bli_sgemmsup_rv_haswell_asm_1x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3103,9 +3103,9 @@ void bli_sgemmsup_rv_haswell_asm_1x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -3120,7 +3120,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3169,25 +3169,25 @@ void bli_sgemmsup_rv_haswell_asm_1x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -3197,7 +3197,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -3212,14 +3212,14 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -3228,7 +3228,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3244,32 +3244,32 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -3278,68 +3278,68 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3414,24 +3414,24 @@ void bli_sgemmsup_rv_haswell_asm_1x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3480,12 +3480,12 @@ void bli_sgemmsup_rv_haswell_asm_1x12 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c index 6a08cecd4..26eec0c09 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -108,8 +108,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 0 @@ -178,7 +178,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 // Advance C and A pointers by the mrs and nrs we just // used, and decrement m_left. cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; - } + } } // Advance C and B pointers by the mrs and nrs we just used, and @@ -208,9 +208,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -225,7 +225,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -286,25 +286,25 @@ void bli_sgemmsup_rv_haswell_asm_6x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -315,14 +315,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -331,7 +331,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -348,14 +348,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -363,14 +363,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -381,14 +381,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) 
vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -396,7 +396,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -414,14 +414,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -429,50 +429,50 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -480,22 +480,22 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -508,26 +508,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -536,60 +536,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -735,51 +735,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -884,12 +884,12 @@ void bli_sgemmsup_rv_haswell_asm_6x16 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -932,8 +932,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -954,9 +954,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -971,7 +971,7 @@ void bli_sgemmsup_rv_haswell_asm_5x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1029,25 +1029,25 @@ void bli_sgemmsup_rv_haswell_asm_5x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1058,20 +1058,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -1088,26 +1088,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1118,19 +1118,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, 
ymm13) - + // ---------------------------------- iteration 3 @@ -1148,82 +1148,82 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -1234,26 +1234,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1262,52 +1262,52 @@ void bli_sgemmsup_rv_haswell_asm_5x16 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1479,46 +1479,46 @@ void bli_sgemmsup_rv_haswell_asm_5x16 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1640,12 +1640,12 @@ void bli_sgemmsup_rv_haswell_asm_5x16 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1688,8 +1688,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1710,9 +1710,9 @@ void bli_sgemmsup_rv_haswell_asm_4x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1727,7 +1727,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1784,25 +1784,25 @@ void bli_sgemmsup_rv_haswell_asm_4x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1813,7 +1813,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1822,7 +1822,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1839,7 +1839,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1847,14 +1847,14 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1865,7 +1865,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1873,7 +1873,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1891,7 +1891,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1899,32 
+1899,32 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1935,7 +1935,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1943,22 +1943,22 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -1967,40 +1967,40 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2031,10 +2031,10 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2122,19 +2122,19 @@ void bli_sgemmsup_rv_haswell_asm_4x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2153,8 +2153,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2225,12 +2225,12 @@ void bli_sgemmsup_rv_haswell_asm_4x16 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2273,8 +2273,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2295,9 +2295,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2312,7 +2312,7 @@ void bli_sgemmsup_rv_haswell_asm_3x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2368,25 +2368,25 @@ void bli_sgemmsup_rv_haswell_asm_3x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2397,13 +2397,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2420,19 +2420,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2443,12 +2443,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2466,37 +2466,37 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2507,67 +2507,67 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2590,10 +2590,10 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
- + label(.SCOLSTORED) @@ -2745,19 +2745,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2771,8 +2771,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2866,12 +2866,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2914,8 +2914,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2936,9 +2936,9 @@ void bli_sgemmsup_rv_haswell_asm_2x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2953,7 +2953,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3008,25 +3008,25 @@ void bli_sgemmsup_rv_haswell_asm_2x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3039,7 +3039,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -3057,14 +3057,14 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3076,7 +3076,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -3095,32 +3095,32 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3132,60 +3132,60 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -3200,10 +3200,10 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3271,19 +3271,19 @@ void bli_sgemmsup_rv_haswell_asm_2x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -3292,8 +3292,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3335,12 +3335,12 @@ void bli_sgemmsup_rv_haswell_asm_2x16 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -3383,8 +3383,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3405,9 +3405,9 @@ void bli_sgemmsup_rv_haswell_asm_1x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -3422,7 +3422,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3476,25 +3476,25 @@ void bli_sgemmsup_rv_haswell_asm_1x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3504,7 +3504,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -3519,14 +3519,14 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3535,7 +3535,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3551,32 +3551,32 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3585,68 +3585,68 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3740,24 +3740,24 @@ void bli_sgemmsup_rv_haswell_asm_1x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3817,12 +3817,12 @@ void bli_sgemmsup_rv_haswell_asm_1x16 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c index 6090f8b0b..53a70d15f 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -174,25 +174,25 @@ void bli_sgemmsup_rv_haswell_asm_6x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -200,19 +200,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -226,25 +226,25 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -252,18 +252,18 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -278,103 +278,103 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) 
vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -383,42 +383,42 @@ void bli_sgemmsup_rv_haswell_asm_6x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -452,45 +452,45 @@ void bli_sgemmsup_rv_haswell_asm_6x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -516,12 +516,12 @@ void bli_sgemmsup_rv_haswell_asm_6x2 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -564,8 +564,8 @@ void bli_sgemmsup_rv_haswell_asm_5x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -586,9 +586,9 @@ void bli_sgemmsup_rv_haswell_asm_5x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -603,7 +603,7 @@ void bli_sgemmsup_rv_haswell_asm_5x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -644,25 +644,25 @@ void bli_sgemmsup_rv_haswell_asm_5x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -670,17 +670,17 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 1 #if 0 @@ -694,23 +694,23 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -718,16 +718,16 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 3 @@ -742,98 +742,98 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) 
// a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -842,37 +842,37 @@ void bli_sgemmsup_rv_haswell_asm_5x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -910,41 +910,41 @@ void bli_sgemmsup_rv_haswell_asm_5x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -973,12 +973,12 @@ void bli_sgemmsup_rv_haswell_asm_5x2 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1021,8 +1021,8 @@ void bli_sgemmsup_rv_haswell_asm_4x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1043,9 +1043,9 @@ void bli_sgemmsup_rv_haswell_asm_4x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1060,7 +1060,7 @@ void bli_sgemmsup_rv_haswell_asm_4x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1100,25 +1100,25 @@ void bli_sgemmsup_rv_haswell_asm_4x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1126,14 +1126,14 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1147,20 +1147,20 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1168,13 +1168,13 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) 
vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 3 @@ -1189,91 +1189,91 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1282,32 +1282,32 @@ void bli_sgemmsup_rv_haswell_asm_4x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1331,37 +1331,37 @@ void bli_sgemmsup_rv_haswell_asm_4x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1380,12 +1380,12 @@ void bli_sgemmsup_rv_haswell_asm_4x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1428,8 +1428,8 @@ void bli_sgemmsup_rv_haswell_asm_3x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1450,9 +1450,9 @@ void bli_sgemmsup_rv_haswell_asm_3x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1467,7 +1467,7 @@ void bli_sgemmsup_rv_haswell_asm_3x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1506,25 +1506,25 @@ void bli_sgemmsup_rv_haswell_asm_3x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1532,12 +1532,12 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1551,18 +1551,18 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1570,11 +1570,11 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 3 @@ -1589,86 +1589,86 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1677,27 +1677,27 @@ void bli_sgemmsup_rv_haswell_asm_3x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1730,33 +1730,33 @@ void bli_sgemmsup_rv_haswell_asm_3x2 jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1780,12 +1780,12 @@ void bli_sgemmsup_rv_haswell_asm_3x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1828,8 +1828,8 @@ void bli_sgemmsup_rv_haswell_asm_2x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1850,9 +1850,9 @@ void bli_sgemmsup_rv_haswell_asm_2x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1867,7 +1867,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1905,25 +1905,25 @@ void bli_sgemmsup_rv_haswell_asm_2x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1933,7 +1933,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2 vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 1 #if 0 @@ -1948,14 +1948,14 @@ void bli_sgemmsup_rv_haswell_asm_2x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1964,7 +1964,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 3 @@ -1980,78 +1980,78 @@ void bli_sgemmsup_rv_haswell_asm_2x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2060,22 +2060,22 @@ void bli_sgemmsup_rv_haswell_asm_2x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2094,29 +2094,29 @@ void bli_sgemmsup_rv_haswell_asm_2x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2130,12 +2130,12 @@ void bli_sgemmsup_rv_haswell_asm_2x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2178,8 +2178,8 @@ void bli_sgemmsup_rv_haswell_asm_1x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2200,9 +2200,9 @@ void bli_sgemmsup_rv_haswell_asm_1x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2217,7 +2217,7 @@ void bli_sgemmsup_rv_haswell_asm_1x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2254,25 +2254,25 @@ void bli_sgemmsup_rv_haswell_asm_1x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2280,7 +2280,7 @@ void bli_sgemmsup_rv_haswell_asm_1x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2293,21 +2293,21 @@ void bli_sgemmsup_rv_haswell_asm_1x2 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 3 @@ -2321,75 +2321,75 @@ void bli_sgemmsup_rv_haswell_asm_1x2 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2398,17 +2398,17 @@ void bli_sgemmsup_rv_haswell_asm_1x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2431,25 +2431,25 @@ void bli_sgemmsup_rv_haswell_asm_1x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2466,12 +2466,12 @@ void bli_sgemmsup_rv_haswell_asm_1x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c index 512fd6052..2d6165710 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -176,25 +176,25 @@ void bli_sgemmsup_rv_haswell_asm_6x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -202,19 +202,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -228,25 +228,25 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -254,18 +254,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -280,103 +280,103 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) 
vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -385,42 +385,42 @@ void bli_sgemmsup_rv_haswell_asm_6x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -474,45 +474,45 @@ void bli_sgemmsup_rv_haswell_asm_6x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -552,12 +552,12 @@ void bli_sgemmsup_rv_haswell_asm_6x4 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -600,8 +600,8 @@ void bli_sgemmsup_rv_haswell_asm_5x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -622,9 +622,9 @@ void bli_sgemmsup_rv_haswell_asm_5x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -639,7 +639,7 @@ void bli_sgemmsup_rv_haswell_asm_5x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -682,25 +682,25 @@ void bli_sgemmsup_rv_haswell_asm_5x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -708,17 +708,17 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 1 #if 0 @@ -732,23 +732,23 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -756,16 +756,16 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 3 @@ -780,98 +780,98 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) 
// a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -880,37 +880,37 @@ void bli_sgemmsup_rv_haswell_asm_5x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -970,41 +970,41 @@ void bli_sgemmsup_rv_haswell_asm_5x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1049,12 +1049,12 @@ void bli_sgemmsup_rv_haswell_asm_5x4 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1097,8 +1097,8 @@ void bli_sgemmsup_rv_haswell_asm_4x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1119,9 +1119,9 @@ void bli_sgemmsup_rv_haswell_asm_4x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1136,7 +1136,7 @@ void bli_sgemmsup_rv_haswell_asm_4x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1178,25 +1178,25 @@ void bli_sgemmsup_rv_haswell_asm_4x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1204,14 +1204,14 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1225,20 +1225,20 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1246,13 +1246,13 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) 
vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 3 @@ -1267,91 +1267,91 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1360,32 +1360,32 @@ void bli_sgemmsup_rv_haswell_asm_4x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1422,37 +1422,37 @@ void bli_sgemmsup_rv_haswell_asm_4x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1482,12 +1482,12 @@ void bli_sgemmsup_rv_haswell_asm_4x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1530,8 +1530,8 @@ void bli_sgemmsup_rv_haswell_asm_3x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1552,9 +1552,9 @@ void bli_sgemmsup_rv_haswell_asm_3x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1569,7 +1569,7 @@ void bli_sgemmsup_rv_haswell_asm_3x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1610,25 +1610,25 @@ void bli_sgemmsup_rv_haswell_asm_3x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1636,12 +1636,12 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1655,18 +1655,18 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1674,11 +1674,11 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 3 @@ -1693,86 +1693,86 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1781,27 +1781,27 @@ void bli_sgemmsup_rv_haswell_asm_3x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1851,33 +1851,33 @@ void bli_sgemmsup_rv_haswell_asm_3x4 jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1911,12 +1911,12 @@ void bli_sgemmsup_rv_haswell_asm_3x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1959,8 +1959,8 @@ void bli_sgemmsup_rv_haswell_asm_2x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1981,9 +1981,9 @@ void bli_sgemmsup_rv_haswell_asm_2x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1998,7 +1998,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2038,25 +2038,25 @@ void bli_sgemmsup_rv_haswell_asm_2x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2066,7 +2066,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4 vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 1 #if 0 @@ -2081,14 +2081,14 @@ void bli_sgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2097,7 +2097,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 3 @@ -2113,78 +2113,78 @@ void bli_sgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2193,22 +2193,22 @@ void bli_sgemmsup_rv_haswell_asm_2x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2235,29 +2235,29 @@ void bli_sgemmsup_rv_haswell_asm_2x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2276,12 +2276,12 @@ void bli_sgemmsup_rv_haswell_asm_2x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2324,8 +2324,8 @@ void bli_sgemmsup_rv_haswell_asm_1x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2346,9 +2346,9 @@ void bli_sgemmsup_rv_haswell_asm_1x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2363,7 +2363,7 @@ void bli_sgemmsup_rv_haswell_asm_1x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2402,25 +2402,25 @@ void bli_sgemmsup_rv_haswell_asm_1x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2428,7 +2428,7 @@ void bli_sgemmsup_rv_haswell_asm_1x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2441,21 +2441,21 @@ void bli_sgemmsup_rv_haswell_asm_1x4 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 3 @@ -2469,75 +2469,75 @@ void bli_sgemmsup_rv_haswell_asm_1x4 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2546,17 +2546,17 @@ void bli_sgemmsup_rv_haswell_asm_1x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2589,25 +2589,25 @@ void bli_sgemmsup_rv_haswell_asm_1x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2630,12 +2630,12 @@ void bli_sgemmsup_rv_haswell_asm_1x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c index ac4e1ee0b..f2cb1df42 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -179,25 +179,25 @@ void bli_sgemmsup_rv_haswell_asm_6x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -207,19 +207,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -235,25 +235,25 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -263,18 +263,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -291,105 +291,105 @@ void bli_sgemmsup_rv_haswell_asm_6x6 
vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -398,12 +398,12 @@ void bli_sgemmsup_rv_haswell_asm_6x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -413,8 +413,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -424,8 +424,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -435,8 +435,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -446,8 +446,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) @@ -457,8 +457,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*4)) @@ -468,8 +468,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -534,57 +534,57 @@ void bli_sgemmsup_rv_haswell_asm_6x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -632,10 +632,10 @@ void bli_sgemmsup_rv_haswell_asm_6x6 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -678,8 +678,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -700,9 +700,9 @@ void bli_sgemmsup_rv_haswell_asm_5x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -717,7 +717,7 @@ void bli_sgemmsup_rv_haswell_asm_5x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -763,25 +763,25 @@ void bli_sgemmsup_rv_haswell_asm_5x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -791,17 +791,17 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 1 #if 0 @@ -817,23 +817,23 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, 
ymm0) @@ -843,16 +843,16 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 3 @@ -869,100 +869,100 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -971,12 +971,12 @@ void bli_sgemmsup_rv_haswell_asm_5x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -986,8 +986,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -997,8 +997,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -1008,8 +1008,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -1019,8 +1019,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) @@ -1030,8 +1030,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm13, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1106,51 +1106,51 @@ void bli_sgemmsup_rv_haswell_asm_5x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1206,10 +1206,10 @@ void bli_sgemmsup_rv_haswell_asm_5x6 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1252,8 +1252,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1274,9 +1274,9 @@ void bli_sgemmsup_rv_haswell_asm_4x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1291,7 +1291,7 @@ void bli_sgemmsup_rv_haswell_asm_4x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1336,25 +1336,25 @@ void bli_sgemmsup_rv_haswell_asm_4x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1364,14 +1364,14 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1387,20 +1387,20 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1410,13 +1410,13 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) 
vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 3 @@ -1433,93 +1433,93 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1528,12 +1528,12 @@ void bli_sgemmsup_rv_haswell_asm_4x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -1543,8 +1543,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -1554,8 +1554,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -1565,9 +1565,9 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - - vextractf128(imm(0x1), ymm10, xmm11) + + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -1576,8 +1576,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm11, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1620,45 +1620,45 @@ void bli_sgemmsup_rv_haswell_asm_4x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1693,10 +1693,10 @@ void bli_sgemmsup_rv_haswell_asm_4x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1739,8 +1739,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1761,9 +1761,9 @@ void bli_sgemmsup_rv_haswell_asm_3x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1778,7 +1778,7 @@ void bli_sgemmsup_rv_haswell_asm_3x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1822,25 +1822,25 @@ void bli_sgemmsup_rv_haswell_asm_3x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1850,12 +1850,12 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1871,18 +1871,18 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1892,11 +1892,11 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 3 @@ -1913,88 +1913,88 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2003,12 +2003,12 @@ void bli_sgemmsup_rv_haswell_asm_3x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -2018,8 +2018,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -2029,8 +2029,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -2040,8 +2040,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vmovsd(xmm9, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2106,39 +2106,39 @@ void bli_sgemmsup_rv_haswell_asm_3x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2181,10 +2181,10 @@ void bli_sgemmsup_rv_haswell_asm_3x6 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2227,8 +2227,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2249,9 +2249,9 @@ void bli_sgemmsup_rv_haswell_asm_2x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2266,7 +2266,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2309,25 +2309,25 @@ void bli_sgemmsup_rv_haswell_asm_2x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2339,7 +2339,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 1 #if 0 @@ -2356,14 +2356,14 @@ void bli_sgemmsup_rv_haswell_asm_2x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2374,7 +2374,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 3 @@ -2392,80 +2392,80 @@ void bli_sgemmsup_rv_haswell_asm_2x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2474,12 +2474,12 @@ void bli_sgemmsup_rv_haswell_asm_2x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -2489,8 +2489,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -2500,8 +2500,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6 vmovsd(xmm7, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2533,33 +2533,33 @@ void bli_sgemmsup_rv_haswell_asm_2x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2581,10 +2581,10 @@ void bli_sgemmsup_rv_haswell_asm_2x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2627,8 +2627,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2649,9 +2649,9 @@ void bli_sgemmsup_rv_haswell_asm_1x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2666,7 +2666,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2708,25 +2708,25 @@ void bli_sgemmsup_rv_haswell_asm_1x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2736,7 +2736,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2751,14 +2751,14 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2767,7 +2767,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -2783,77 +2783,77 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2862,12 +2862,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -2877,8 +2877,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vmovsd(xmm5, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2920,27 +2920,27 @@ void bli_sgemmsup_rv_haswell_asm_1x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2968,12 +2968,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -3018,8 +3018,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3040,9 +3040,9 @@ void bli_sgemmsup_rv_haswell_asm_1x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -3057,7 +3057,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3101,25 +3101,25 @@ void bli_sgemmsup_rv_haswell_asm_1x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -3127,7 +3127,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 1 #if 0 @@ -3140,21 +3140,21 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -3168,96 +3168,96 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3308,23 +3308,23 @@ void bli_sgemmsup_rv_haswell_asm_1x6 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3357,12 +3357,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c index 2b1a221ad..603ba7554 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -181,25 +181,25 @@ void bli_sgemmsup_rv_haswell_asm_6x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -207,19 +207,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -233,25 +233,25 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -259,18 +259,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -285,103 +285,103 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) 
vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -390,42 +390,42 @@ void bli_sgemmsup_rv_haswell_asm_6x8 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -502,45 +502,45 @@ void bli_sgemmsup_rv_haswell_asm_6x8 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -596,12 +596,12 @@ void bli_sgemmsup_rv_haswell_asm_6x8 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -644,8 +644,8 @@ void bli_sgemmsup_rv_haswell_asm_5x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -666,9 +666,9 @@ void bli_sgemmsup_rv_haswell_asm_5x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -683,7 +683,7 @@ void bli_sgemmsup_rv_haswell_asm_5x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -731,25 +731,25 @@ void bli_sgemmsup_rv_haswell_asm_5x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -757,17 +757,17 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 1 #if 0 @@ -781,23 +781,23 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -805,16 +805,16 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 3 @@ -829,98 +829,98 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) 
// a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -929,37 +929,37 @@ void bli_sgemmsup_rv_haswell_asm_5x8 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1049,41 +1049,41 @@ void bli_sgemmsup_rv_haswell_asm_5x8 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1147,12 +1147,12 @@ void bli_sgemmsup_rv_haswell_asm_5x8 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1195,8 +1195,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1217,9 +1217,9 @@ void bli_sgemmsup_rv_haswell_asm_4x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1234,7 +1234,7 @@ void bli_sgemmsup_rv_haswell_asm_4x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1281,25 +1281,25 @@ void bli_sgemmsup_rv_haswell_asm_4x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1307,14 +1307,14 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1328,20 +1328,20 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1349,13 +1349,13 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) 
vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 3 @@ -1370,38 +1370,38 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1409,66 +1409,66 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1487,10 +1487,10 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -1538,19 +1538,19 @@ void bli_sgemmsup_rv_haswell_asm_4x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1565,8 +1565,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vmovups(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1604,12 +1604,12 @@ void bli_sgemmsup_rv_haswell_asm_4x8 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1652,8 +1652,8 @@ void bli_sgemmsup_rv_haswell_asm_3x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1674,9 +1674,9 @@ void bli_sgemmsup_rv_haswell_asm_3x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1691,7 +1691,7 @@ void bli_sgemmsup_rv_haswell_asm_3x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1737,25 +1737,25 @@ void bli_sgemmsup_rv_haswell_asm_3x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1763,12 +1763,12 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1782,18 +1782,18 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1801,11 +1801,11 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 3 @@ -1820,36 +1820,36 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1857,63 +1857,63 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1927,10 +1927,10 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2010,19 +2010,19 @@ void bli_sgemmsup_rv_haswell_asm_3x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -2033,8 +2033,8 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vmovups(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2084,12 +2084,12 @@ void bli_sgemmsup_rv_haswell_asm_3x8 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2132,8 +2132,8 @@ void bli_sgemmsup_rv_haswell_asm_2x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2154,9 +2154,9 @@ void bli_sgemmsup_rv_haswell_asm_2x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2171,7 +2171,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2216,25 +2216,25 @@ void bli_sgemmsup_rv_haswell_asm_2x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2244,7 +2244,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 1 #if 0 @@ -2259,14 +2259,14 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2275,7 +2275,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 3 @@ -2291,32 +2291,32 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2325,58 +2325,58 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -2385,10 +2385,10 @@ void bli_sgemmsup_rv_haswell_asm_2x8 vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2426,27 +2426,27 @@ void bli_sgemmsup_rv_haswell_asm_2x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2470,12 +2470,12 @@ void bli_sgemmsup_rv_haswell_asm_2x8 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2518,8 +2518,8 @@ void bli_sgemmsup_rv_haswell_asm_1x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2540,9 +2540,9 @@ void bli_sgemmsup_rv_haswell_asm_1x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2557,7 +2557,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2601,25 +2601,25 @@ void bli_sgemmsup_rv_haswell_asm_1x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2627,7 +2627,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2640,21 +2640,21 @@ void bli_sgemmsup_rv_haswell_asm_1x8 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -2668,96 +2668,96 @@ void bli_sgemmsup_rv_haswell_asm_1x8 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2808,23 +2808,23 @@ void bli_sgemmsup_rv_haswell_asm_1x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2857,12 +2857,12 @@ void bli_sgemmsup_rv_haswell_asm_1x8 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c index f20e43f7c..a53b763da 100644 --- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c +++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c @@ -264,8 +264,8 @@ void bli_dgemm_knc_asm_30x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { double * a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c index 18a8e5e2e..7374abfe0 100644 --- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c +++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c @@ -264,8 +264,8 @@ void bli_sgemm_knc_asm_30x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { float * a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c index 91fe1989f..2464ecf0a 100644 --- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c +++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c @@ -116,7 +116,7 @@ void bli_dpackm_knl_asm_8xk double* restrict kappa_, double* restrict a_, inc_t inca_, inc_t lda_, double* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; @@ -367,7 +367,7 @@ void bli_dpackm_knl_asm_24xk double* restrict kappa_, double* restrict a_, inc_t inca_, inc_t lda_, double* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c index 8c4bdfe6b..4326a00dd 100644 --- 
a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c +++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c @@ -118,7 +118,7 @@ void bli_spackm_knl_asm_16xk float* restrict kappa_, float* restrict a_, inc_t inca_, inc_t lda_, float* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; @@ -385,7 +385,7 @@ void bli_spackm_knl_asm_24xk float* restrict kappa_, float* restrict a_, inc_t inca_, inc_t lda_, float* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c index a7f860ae0..11a480997 100644 --- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c +++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c @@ -193,8 +193,8 @@ void bli_dgemm_knl_asm_24x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { (void)data; diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c index 64feba09f..cbef0cb82 100644 --- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c +++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c @@ -190,8 +190,8 @@ void bli_sgemm_knl_asm_24x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c_, inc_t cs_c_, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { (void)data; diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c index 2dd7c7324..c329912b4 100644 --- a/kernels/penryn/1/bli_axpyv_penryn_int.c +++ b/kernels/penryn/1/bli_axpyv_penryn_int.c @@ -50,7 +50,7 @@ void bli_daxpyv_penryn_int double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c 
index 2e88a577a..6d63a9cf0 100644 --- a/kernels/penryn/1/bli_dotv_penryn_int.c +++ b/kernels/penryn/1/bli_dotv_penryn_int.c @@ -51,7 +51,7 @@ void bli_ddotv_penryn_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict x_cast = x; diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c index c809ebb41..350a0af5f 100644 --- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c +++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c @@ -53,7 +53,7 @@ void bli_daxpy2v_penryn_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c index ce4c4f786..f52c05d67 100644 --- a/kernels/penryn/1f/bli_axpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c @@ -53,7 +53,7 @@ void bli_daxpyf_penryn_int double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c index 6b9dab773..244e3f11c 100644 --- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c +++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c @@ -54,7 +54,7 @@ void bli_ddotaxpyv_penryn_int double* restrict y, inc_t incy, double* restrict rho, double* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c index fe102d427..3ff80319a 100644 --- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c @@ -58,7 +58,7 @@ void bli_ddotxaxpyf_penryn_int double* restrict beta, 
double* restrict y, inc_t incy, double* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c index ac9887d59..e8775bd0c 100644 --- a/kernels/penryn/1f/bli_dotxf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c @@ -54,7 +54,7 @@ void bli_ddotxf_penryn_int double* restrict x, inc_t incx, double* restrict beta, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c index a3e39c3ac..8a3ec077f 100644 --- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c @@ -47,8 +47,8 @@ void bli_sgemm_penryn_asm_8x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -522,8 +522,8 @@ void bli_dgemm_penryn_asm_4x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c index 7bef618fa..aa8dcf858 100644 --- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c @@ -47,8 +47,8 @@ void bli_sgemmtrsm_l_penryn_asm_8x4 float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -65,8 +65,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 double* restrict b01, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t 
cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* b_next = bli_auxinfo_next_b( data ); @@ -81,30 +81,30 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); begin_asm() - + mov(var(a10), rax) // load address of a10. mov(var(b01), rbx) // load address of b01. //mov(var(b_next), r9) // load address of b_next. - + sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. - + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) - + //mov(var(c11), rcx) // load address of c11 //mov(var(rs_c), rdi) // load cs_c //lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c; - + //prefetch(2, mem(r9, 0*8)) // prefetch b_next - + xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) - + //prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c xorpd(xmm8, xmm8) movaps(xmm8, xmm9) @@ -117,20 +117,20 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 //prefetch(2, mem(rdx, rdi, 1, 3*8)) // prefetch c + 3*cs_c movaps(xmm8, xmm14) movaps(xmm8, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.LOOPKITER) // MAIN LOOP - + //prefetch(0, mem(rax, 1264)) prefetch(0, mem(rax, (4*35+1)*8)) - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -138,13 +138,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -152,7 +152,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -160,8 +160,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) @@ -169,13 +169,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) @@ -183,7 +183,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -191,10 +191,10 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) - + //prefetch(0, mem(rax, 1328)) prefetch(0, mem(rax, (4*37+1)*8)) - + addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) @@ -202,13 +202,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) @@ -216,7 +216,7 @@ void 
bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -224,8 +224,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -1*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) @@ -233,17 +233,17 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + //sub(imm(-4*4*8), r9) // b_next += 4*4 (unroll x nr) - + addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) @@ -251,9 +251,9 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -261,26 +261,26 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) - + //prefetch(2, mem(r9, 0*8)) // prefetch b_next[0] //prefetch(2, mem(r9, 8*8)) // prefetch b_next[8] - - + + dec(rsi) // i -= 1; jne(.LOOPKITER) // iterate again if i != 0. - - - + + + label(.CONSIDERKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.LOOPKLEFT) // EDGE LOOP - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -288,13 +288,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -302,7 +302,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -310,28 +310,28 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + sub(imm(0-4*1*8), rax) // a += 4 (1 x mr) sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.LOOPKLEFT) // iterate again if i != 0. - - - + + + label(.POSTACCUM) - + addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) - - - + + + mov(var(b11), rbx) // load address of b11. 
- + // xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) @@ -343,31 +343,31 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(xmm8, xmm1) unpcklpd(xmm8, xmm0) unpckhpd(xmm9, xmm1) - + movaps(xmm11, xmm4) movaps(xmm10, xmm5) unpcklpd(xmm10, xmm4) unpckhpd(xmm11, xmm5) - + movaps(xmm13, xmm2) movaps(xmm12, xmm3) unpcklpd(xmm12, xmm2) unpckhpd(xmm13, xmm3) - + movaps(xmm15, xmm6) movaps(xmm14, xmm7) unpcklpd(xmm14, xmm6) unpckhpd(xmm15, xmm7) - + // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - + mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate - - movaps(mem(rbx, 0*16), xmm8) + + movaps(mem(rbx, 0*16), xmm8) movaps(mem(rbx, 1*16), xmm12) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 ) @@ -382,13 +382,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rbx, 6*16), xmm11) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) - + // (Now scaled by alpha:) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) - + subpd(xmm0, xmm8) // xmm8 -= xmm0 subpd(xmm1, xmm9) // xmm9 -= xmm1 subpd(xmm2, xmm10) // xmm10 -= xmm2 @@ -397,28 +397,28 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 subpd(xmm5, xmm13) // xmm13 -= xmm5 subpd(xmm6, xmm14) // xmm14 -= xmm6 subpd(xmm7, xmm15) // xmm15 -= xmm7 - - - + + + label(.TRSM) - - + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + lea(mem(rcx, rdi, 2), rdx) 
// c11_2 = c11 + 2*cs_c - - - + + + // iteration 0 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); @@ -426,7 +426,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm0, xmm8) // xmm8 /= alpha00; divpd(xmm0, xmm12) // xmm12 /= alpha00; #endif - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] @@ -435,14 +435,14 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 1 - + movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) - + movaps(xmm0, xmm4) // xmm4 = xmm0 mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 ) @@ -455,7 +455,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm1, xmm9) // xmm9 /= alpha11; divpd(xmm1, xmm13) // xmm13 /= alpha11; #endif - + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -464,15 +464,15 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 2 - + movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 ) @@ -490,7 +490,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm2, xmm10) // xmm10 /= alpha22; divpd(xmm2, 
xmm14) // xmm14 /= alpha22; #endif - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ -499,16 +499,16 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 3 - + movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 @@ -531,16 +531,16 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm3, xmm11) // xmm11 /= alpha33; divpd(xmm3, xmm15) // xmm15 /= alpha33; #endif - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] - - - + + + end_asm( : // output operands (none) diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c index add12ea24..2efc037cc 100644 --- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c @@ -47,8 +47,8 @@ void bli_sgemmtrsm_u_penryn_asm_8x4 float* restrict b21, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -65,8 +65,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 double* restrict b21, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* 
restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* b_next = bli_auxinfo_next_b( data ); @@ -81,23 +81,23 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); begin_asm() - + mov(var(a12), rax) // load address of a12. mov(var(b21), rbx) // load address of b21. //mov(var(b_next), r9) // load address of b_next. - + add(imm(8*16), rax) // increment pointers to allow byte add(imm(8*16), rbx) // offsets in the unrolled iterations. - + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) - + xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) - + xorpd(xmm8, xmm8) movaps(xmm8, xmm9) movaps(xmm8, xmm10) @@ -106,19 +106,19 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(xmm8, xmm13) movaps(xmm8, xmm14) movaps(xmm8, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.LOOPKITER) // MAIN LOOP - + prefetch(0, mem(rax, 1264)) - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -126,13 +126,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -140,7 +140,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -148,8 +148,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) @@ -157,13 +157,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) @@ -171,7 +171,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -179,9 +179,9 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) - + prefetch(0, mem(rax, 1328)) - + addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) @@ -189,13 +189,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) @@ -203,7 +203,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - 
+ addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -211,8 +211,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -1*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) @@ -220,15 +220,15 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + add(imm(4*4*8), rax) // a += 4*4 (unroll x mr) - + addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) @@ -236,9 +236,9 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -246,24 +246,24 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) - - - + + + dec(rsi) // i -= 1; jne(.LOOPKITER) // iterate again if i != 0. - - - + + + label(.CONSIDERKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.LOOPKLEFT) // EDGE LOOP - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -271,13 +271,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -285,7 +285,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -293,28 +293,28 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + add(imm(4*1*8), rax) // a += 4 (1 x mr) add(imm(4*1*8), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.LOOPKLEFT) // iterate again if i != 0. - - - + + + label(.POSTACCUM) - + addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) - - - + + + mov(var(b11), rbx) // load address of b11. 
- + // xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) @@ -326,30 +326,30 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(xmm8, xmm1) unpcklpd(xmm8, xmm0) unpckhpd(xmm9, xmm1) - + movaps(xmm11, xmm4) movaps(xmm10, xmm5) unpcklpd(xmm10, xmm4) unpckhpd(xmm11, xmm5) - + movaps(xmm13, xmm2) movaps(xmm12, xmm3) unpcklpd(xmm12, xmm2) unpckhpd(xmm13, xmm3) - + movaps(xmm15, xmm6) movaps(xmm14, xmm7) unpcklpd(xmm14, xmm6) unpckhpd(xmm15, xmm7) - + // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - + mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate - + movaps(mem(rbx, 0*16), xmm8) movaps(mem(rbx, 1*16), xmm12) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) @@ -365,13 +365,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rbx, 6*16), xmm11) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) - + // (Now scaled by alpha:) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) - + subpd(xmm0, xmm8) // xmm8 -= xmm0 subpd(xmm1, xmm9) // xmm9 -= xmm1 subpd(xmm2, xmm10) // xmm10 -= xmm2 @@ -380,31 +380,31 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 subpd(xmm5, xmm13) // xmm13 -= xmm5 subpd(xmm6, xmm14) // xmm14 -= xmm6 subpd(xmm7, xmm15) // xmm15 -= xmm7 - - - + + + label(.TRSM) - - + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + add(rsi, rcx) // c11 += (4-1)*rs_c add(rsi, rcx) add(rsi, rcx) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c; - - 
- + + + // iteration 0 - + movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); @@ -412,7 +412,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm3, xmm11) // xmm11 /= alpha33; divpd(xmm3, xmm15) // xmm15 /= alpha33; #endif - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] @@ -421,14 +421,14 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 1 - + movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23 - + movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 ) @@ -441,7 +441,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm2, xmm10) // xmm10 /= alpha22; divpd(xmm2, xmm14) // xmm14 /= alpha22; #endif - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ -450,15 +450,15 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 2 - + movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13 - + movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 ) @@ -476,7 +476,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm1, xmm9) // xmm9 /= alpha11; divpd(xmm1, xmm13) // xmm13 
/= alpha11; #endif - + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -485,16 +485,16 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 3 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03 - + movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 @@ -517,16 +517,16 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm0, xmm8) // xmm8 /= alpha00; divpd(xmm0, xmm12) // xmm12 /= alpha00; #endif - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] - - - + + + end_asm( : // output operands (none) : // input operands diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c index 21c0b2f10..69341320e 100644 --- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c @@ -43,8 +43,8 @@ void bli_strsm_l_penryn_asm_8x4 float* restrict a11, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -55,8 +55,8 @@ void bli_dtrsm_l_penryn_asm_4x4 double* restrict a11, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict 
cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -65,9 +65,9 @@ void bli_dtrsm_l_penryn_asm_4x4 uint64_t cs_c = cs_c0; begin_asm() - + mov(var(b11), rbx) // load address of b11. - + movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 ) @@ -76,28 +76,28 @@ void bli_dtrsm_l_penryn_asm_4x4 movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 ) - - - + + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c - - - + + + // iteration 0 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) - + mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] @@ -106,14 +106,14 @@ void bli_dtrsm_l_penryn_asm_4x4 movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 1 - + movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) - + movaps(xmm0, xmm4) // xmm4 = xmm0 mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 ) @@ -121,7 +121,7 @@ void bli_dtrsm_l_penryn_asm_4x4 subpd(xmm4, xmm13) // xmm13 -= xmm4 mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); - + movaps(xmm9, 
mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -130,15 +130,15 @@ void bli_dtrsm_l_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 2 - + movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 ) @@ -151,7 +151,7 @@ void bli_dtrsm_l_penryn_asm_4x4 subpd(xmm4, xmm14) // xmm14 -= xmm4 mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ -160,16 +160,16 @@ void bli_dtrsm_l_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 3 - + movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 @@ -187,16 +187,16 @@ void bli_dtrsm_l_penryn_asm_4x4 subpd(xmm4, xmm15) // xmm15 -= xmm4 mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( 
gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] - - - + + + end_asm( : // output operands (none) diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c index 23855a460..0befb4e4e 100644 --- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c @@ -43,8 +43,8 @@ void bli_strsm_u_penryn_asm_8x4 float* restrict a11, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -55,8 +55,8 @@ void bli_dtrsm_u_penryn_asm_4x4 double* restrict a11, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -65,9 +65,9 @@ void bli_dtrsm_u_penryn_asm_4x4 uint64_t cs_c = cs_c0; begin_asm() - + mov(var(b11), rbx) // load address of b11. 
- + movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 ) @@ -76,31 +76,31 @@ void bli_dtrsm_u_penryn_asm_4x4 movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 ) - - - + + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + add(rsi, rcx) // c11 += (4-1)*rs_c add(rsi, rcx) add(rsi, rcx) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c; - - - + + + // iteration 0 - + movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] @@ -109,14 +109,14 @@ void bli_dtrsm_u_penryn_asm_4x4 movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 1 - + movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23 - + movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 ) @@ -124,7 +124,7 @@ void bli_dtrsm_u_penryn_asm_4x4 subpd(xmm7, xmm14) // xmm14 -= xmm7 mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ 
-133,15 +133,15 @@ void bli_dtrsm_u_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 2 - + movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13 - + movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 ) @@ -154,7 +154,7 @@ void bli_dtrsm_u_penryn_asm_4x4 subpd(xmm6, xmm13) // xmm13 -= xmm6 mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); - + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -163,16 +163,16 @@ void bli_dtrsm_u_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 3 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03 - + movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 @@ -190,16 +190,16 @@ void bli_dtrsm_u_penryn_asm_4x4 subpd(xmm5, xmm12) // xmm12 -= xmm5 mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] - - - + + + end_asm( : // output operands 
(none) diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c index e65ce7178..95ce7edeb 100644 --- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c +++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c @@ -50,8 +50,8 @@ void bli_sgemm_piledriver_asm_16x3 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -531,8 +531,8 @@ void bli_dgemm_piledriver_asm_8x3 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -987,8 +987,8 @@ void bli_cgemm_piledriver_asm_4x2 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -1397,8 +1397,8 @@ void bli_zgemm_piledriver_asm_2x2 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c index 84e7d16d3..abf66f58f 100644 --- a/kernels/power10/3/bli_dgemm_power10_mma.c +++ b/kernels/power10/3/bli_dgemm_power10_mma.c @@ -70,8 +70,8 @@ void bli_dgemm_power10_mma_8x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c index c7f81dc7d..d0c9390f5 100644 --- 
a/kernels/power10/3/bli_i16gemm_power10_mma.c +++ b/kernels/power10/3/bli_i16gemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i16gemm_power10_mma_8x16 short* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c index 9e8d99c13..7d84e68e2 100644 --- a/kernels/power10/3/bli_i16sgemm_power10_mma.c +++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i16sgemm_power10_mma_8x16 short* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c index 7527f271f..6c78a9f00 100644 --- a/kernels/power10/3/bli_i4gemm_power10_mma.c +++ b/kernels/power10/3/bli_i4gemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i4gemm_power10_mma_8x16 nibbles* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c index 037a28595..8a0b158a5 100644 --- a/kernels/power10/3/bli_i8gemm_power10_mma.c +++ b/kernels/power10/3/bli_i8gemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i8gemm_power10_mma_8x16 int8_t* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t k_iter = (k-1) / 4; diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c index b37a0c7ce..c16710f45 100644 --- a/kernels/power10/3/bli_sbgemm_power10_mma.c +++ b/kernels/power10/3/bli_sbgemm_power10_mma.c @@ 
-64,8 +64,8 @@ void bli_sbgemm_power10_mma_8x16 bfloat16* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c index 42bbaa916..15895e654 100644 --- a/kernels/power10/3/bli_sgemm_power10_mma.c +++ b/kernels/power10/3/bli_sgemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_sgemm_power10_mma_8x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c index 0e80735df..dc62b5d60 100644 --- a/kernels/power10/3/bli_shgemm_power10_mma.c +++ b/kernels/power10/3/bli_shgemm_power10_mma.c @@ -64,8 +64,8 @@ void bli_shgemm_power10_mma_8x16 float16* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power7/3/bli_gemm_power7_int_8x4.c b/kernels/power7/3/bli_gemm_power7_int_8x4.c index b9ce85f72..8ca0c891e 100644 --- a/kernels/power7/3/bli_gemm_power7_int_8x4.c +++ b/kernels/power7/3/bli_gemm_power7_int_8x4.c @@ -58,8 +58,8 @@ void bli_sgemm_power7_int_8x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 1 || defined(UTEST) @@ -100,8 +100,8 @@ void bli_dgemm_power7_int_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( cs_c == 1 ) @@ -457,8 +457,8 @@ void bli_cgemm_power7_int_8x4 scomplex* 
restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 1 || defined(UTEST) @@ -510,8 +510,8 @@ void bli_zgemm_power7_int_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 1 || defined(UTEST) diff --git a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h index 50984a67d..a8082b38e 100644 --- a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h +++ b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h @@ -51,8 +51,8 @@ void bli_sgemm_opt_8x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); void bli_dgemm_opt_8x4 @@ -65,8 +65,8 @@ void bli_dgemm_opt_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); void bli_cgemm_opt_8x4 @@ -79,8 +79,8 @@ void bli_cgemm_opt_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); void bli_zgemm_opt_8x4 @@ -93,8 +93,8 @@ void bli_zgemm_opt_8x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); #endif diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c index 3e5f0d416..70af2b17e 100644 --- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c +++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c @@ -45,8 +45,8 @@ void bli_dgemm_power9_asm_12x6 double* restrict b, double* restrict beta, 
double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c index 7890ad347..051af62e7 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c @@ -50,8 +50,8 @@ void bli_sgemm_sandybridge_asm_8x8 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -542,8 +542,8 @@ void bli_dgemm_sandybridge_asm_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1004,8 +1004,8 @@ void bli_cgemm_sandybridge_asm_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1707,8 +1707,8 @@ void bli_zgemm_sandybridge_asm_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c index 6bf991082..cb1cdc7c2 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c @@ -48,8 +48,8 @@ void bli_sgemm_sandybridge_int_8x8 float* restrict b, float* restrict beta, float* restrict c, 
inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -65,8 +65,8 @@ void bli_dgemm_sandybridge_int_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -503,8 +503,8 @@ void bli_cgemm_sandybridge_int_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -523,8 +523,8 @@ void bli_zgemm_sandybridge_int_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c index 9943a170b..2579ac4b5 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c @@ -298,7 +298,7 @@ void bli_dgemm_skx_asm_16x12_l2 double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, - cntx_t* restrict cntx + cntx_t* cntx ) { (void)data; diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c index e3bc52041..babb89a1d 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c @@ -164,7 +164,7 @@ void bli_dgemm_skx_asm_16x14 double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, - cntx_t* restrict cntx + cntx_t* cntx ) { (void)data; diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c index 8808449b6..99b850d1d 100644 --- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c @@ -328,7 +328,7 @@ void bli_sgemm_skx_asm_32x12_l2 float* restrict beta, float* restrict c, 
inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, - cntx_t* restrict cntx + cntx_t* cntx ) { (void)data; diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c index 4ece5af29..d1263a6c1 100644 --- a/kernels/zen/1/bli_amaxv_zen_int.c +++ b/kernels/zen/1/bli_amaxv_zen_int.c @@ -104,7 +104,7 @@ void bli_samaxv_zen_int dim_t n, float* restrict x, inc_t incx, dim_t* restrict i_max, - cntx_t* restrict cntx + cntx_t* cntx ) { float* minus_one = PASTEMAC(s,m1); @@ -202,7 +202,7 @@ void bli_samaxv_zen_int max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 ); - + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); @@ -210,7 +210,7 @@ void bli_samaxv_zen_int max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 ); maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 ); - + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); @@ -218,7 +218,7 @@ void bli_samaxv_zen_int max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 1 ); maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 ); - + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); @@ -269,7 +269,7 @@ void bli_damaxv_zen_int dim_t n, double* restrict x, inc_t incx, dim_t* restrict i_max, - cntx_t* restrict cntx + cntx_t* cntx ) { double* minus_one = PASTEMAC(d,m1); @@ -367,15 +367,15 @@ void bli_damaxv_zen_int max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 ); - + mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, 
maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); - + max_vec_hi.v = _mm_permute_pd( max_vec_lo.v, 1 ); maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 ); - + mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); diff --git a/kernels/zen/1/bli_axpyv_zen_int.c b/kernels/zen/1/bli_axpyv_zen_int.c index 686580b29..b842c59ed 100644 --- a/kernels/zen/1/bli_axpyv_zen_int.c +++ b/kernels/zen/1/bli_axpyv_zen_int.c @@ -62,7 +62,7 @@ void bli_saxpyv_zen_int float* restrict alpha, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -166,7 +166,7 @@ void bli_daxpyv_zen_int double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c index 873b7da53..6ad6d30cf 100644 --- a/kernels/zen/1/bli_axpyv_zen_int10.c +++ b/kernels/zen/1/bli_axpyv_zen_int10.c @@ -62,7 +62,7 @@ void bli_saxpyv_zen_int10 float* restrict alpha, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -268,7 +268,7 @@ void bli_daxpyv_zen_int10 double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c index 5fd2b1576..6307b5341 100644 --- a/kernels/zen/1/bli_copyv_zen_int.c +++ b/kernels/zen/1/bli_copyv_zen_int.c @@ -1,330 +1,330 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "immintrin.h" -#include "blis.h" - -// ----------------------------------------------------------------------------- - -void bli_scopyv_zen_int - ( - conj_t conjx, - dim_t n, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy, - cntx_t* restrict cntx - ) -{ - const dim_t num_elem_per_reg = 8; - dim_t i = 0; - __m256 xv[16]; - - // If the vector dimension is zero return early. 
- if ( bli_zero_dim1( n ) ) return; - - if ( incx == 1 && incy == 1 ) - { -#if 0 - PRAGMA_SIMD - for (i = 0; i < n; i++) - { - y[i] = x[i]; - } -#endif -#if 0 - memcpy(y, x, n << 2); -#endif -#if 1 - - // For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128 - // for example if n = 255 - // n & ~0x7F results in 128: copy from 0 to 128 happens in first loop - // n & ~0x3F results in 192: copy from 128 to 192 happens in second loop - // n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on. - for ( i = 0; i < (n & (~0x7F)); i += 128 ) - { - xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); - xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); - xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); - xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); - xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); - xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); - xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); - xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); - xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8); - xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9); - xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10); - xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11); - xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12); - xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13); - xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14); - xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15); - - _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); - _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); - _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); - _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); - _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); - _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); - _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); - _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); - _mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]); - _mm256_storeu_ps(y + num_elem_per_reg * 9, 
xv[9]); - _mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]); - _mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]); - _mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]); - _mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]); - _mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]); - _mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]); - - y += 128; - x += 128; - } - for ( ; i < (n & (~0x3F)); i += 64 ) - { - xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); - xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); - xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); - xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); - xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); - xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); - xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); - xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); - - _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); - _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); - _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); - _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); - _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); - _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); - _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); - _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); - - y += 64; - x += 64; - } - for ( ; i < (n & (~0x1F)); i += 32 ) - { - xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); - xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); - xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); - xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); - - _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); - _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); - _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); - _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); - - y += 32; - x += 32; - } - for ( ; i < (n & (~0x0F)); i += 16 ) - { - xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); - xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); - - _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); - 
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); - - y += 16; - x += 16; - } - for ( ; i < (n & (~0x07)); i += 8 ) - { - xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); - _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); - y += 8; - x += 8; - } - for ( ; i < n; ++i ) - { - *y++ = *x++; - } -#endif - } - else - { - for ( dim_t i = 0; i < n; ++i ) - { - *y = *x; - x += incx; - y += incy; - } - } -} - -// ----------------------------------------------------------------------------- - -void bli_dcopyv_zen_int - ( - conj_t conjx, - dim_t n, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy, - cntx_t* restrict cntx - ) -{ - const dim_t num_elem_per_reg = 4; - dim_t i = 0; - __m256d xv[16]; - - // If the vector dimension is zero return early. - if ( bli_zero_dim1( n ) ) return; - - if ( incx == 1 && incy == 1 ) - { -#if 0 - PRAGMA_SIMD - for (i = 0; i < n; ++i) - { - y[i] = x[i]; - } -#endif -#if 0 - memcpy(y, x, n << 3); -#endif -#if 1 - // n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64, - // the copy operation will be done for the multiples of 64 - for ( i = 0; i < (n & (~0x3F)); i += 64 ) - { - xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); - xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); - xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); - xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); - xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); - xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); - xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); - xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); - xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8); - xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9); - xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10); - xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11); - xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12); - xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13); - xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14); - xv[15] = _mm256_loadu_pd(x + 
num_elem_per_reg * 15); - _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); - _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); - _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); - _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); - _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); - _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); - _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); - _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); - _mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]); - _mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]); - _mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]); - _mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]); - _mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]); - _mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]); - _mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]); - _mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]); - y += num_elem_per_reg * 16; - x += num_elem_per_reg * 16; - } - for ( ; i < (n & (~0x1F)); i += 32 ) - { - xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); - xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); - xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); - xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); - xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); - xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); - xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); - xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); - - _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); - _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); - _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); - _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); - _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); - _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); - _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); - _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); - - y += num_elem_per_reg * 8; - x += num_elem_per_reg * 8; - } - for ( ; i < (n & (~0xF)); i += 16 ) - { - xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); 
- xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); - xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); - xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); - - _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); - _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); - _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); - _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); - - y += num_elem_per_reg * 4; - x += num_elem_per_reg * 4; - } - for ( ; i < (n & (~0x07)); i += 8 ) - { - xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); - xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); - - _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); - _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); - - y += num_elem_per_reg * 2; - x += num_elem_per_reg * 2; - } - for ( ; i < (n & (~0x03)); i += 4 ) - { - xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); - _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); - y += num_elem_per_reg; - x += num_elem_per_reg; - } - for ( ; i < n; ++i ) - { - *y++ = *x++; - } -#endif - } - else - { - for ( i = 0; i < n; ++i ) - { - *y = *x; - - x += incx; - y += incy; - } - } -} - +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "blis.h" + +// ----------------------------------------------------------------------------- + +void bli_scopyv_zen_int + ( + conj_t conjx, + dim_t n, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + cntx_t* cntx + ) +{ + const dim_t num_elem_per_reg = 8; + dim_t i = 0; + __m256 xv[16]; + + // If the vector dimension is zero return early. + if ( bli_zero_dim1( n ) ) return; + + if ( incx == 1 && incy == 1 ) + { +#if 0 + PRAGMA_SIMD + for (i = 0; i < n; i++) + { + y[i] = x[i]; + } +#endif +#if 0 + memcpy(y, x, n << 2); +#endif +#if 1 + + // For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128 + // for example if n = 255 + // n & ~0x7F results in 128: copy from 0 to 128 happens in first loop + // n & ~0x3F results in 192: copy from 128 to 192 happens in second loop + // n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on. 
+ for ( i = 0; i < (n & (~0x7F)); i += 128 ) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); + xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8); + xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9); + xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10); + xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11); + xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12); + xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13); + xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14); + xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); + _mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]); + _mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]); + _mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]); + _mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]); + _mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]); + _mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]); + _mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]); + _mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]); + + y += 128; + x += 128; + } + for ( ; i < (n & (~0x3F)); i += 64 ) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 
2); + xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); + + y += 64; + x += 64; + } + for ( ; i < (n & (~0x1F)); i += 32 ) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); + + y += 32; + x += 32; + } + for ( ; i < (n & (~0x0F)); i += 16 ) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); + + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); + + y += 16; + x += 16; + } + for ( ; i < (n & (~0x07)); i += 8 ) + { + xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); + _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); + y += 8; + x += 8; + } + for ( ; i < n; ++i ) + { + *y++ = *x++; + } +#endif + } + else + { + for ( dim_t i = 0; i < n; ++i ) + { + *y = *x; + x += incx; + y += incy; + } + } +} + +// ----------------------------------------------------------------------------- + +void bli_dcopyv_zen_int + ( + conj_t conjx, + dim_t n, + double* restrict x, 
inc_t incx, + double* restrict y, inc_t incy, + cntx_t* cntx + ) +{ + const dim_t num_elem_per_reg = 4; + dim_t i = 0; + __m256d xv[16]; + + // If the vector dimension is zero return early. + if ( bli_zero_dim1( n ) ) return; + + if ( incx == 1 && incy == 1 ) + { +#if 0 + PRAGMA_SIMD + for (i = 0; i < n; ++i) + { + y[i] = x[i]; + } +#endif +#if 0 + memcpy(y, x, n << 3); +#endif +#if 1 + // n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64, + // the copy operation will be done for the multiples of 64 + for ( i = 0; i < (n & (~0x3F)); i += 64 ) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); + xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8); + xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9); + xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10); + xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11); + xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12); + xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13); + xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14); + xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15); + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); + _mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]); + _mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]); + _mm256_storeu_pd(y + 
num_elem_per_reg * 10, xv[10]); + _mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]); + _mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]); + _mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]); + _mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]); + _mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]); + y += num_elem_per_reg * 16; + x += num_elem_per_reg * 16; + } + for ( ; i < (n & (~0x1F)); i += 32 ) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); + xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); + xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); + xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); + xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); + + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); + _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); + _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); + _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); + _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); + + y += num_elem_per_reg * 8; + x += num_elem_per_reg * 8; + } + for ( ; i < (n & (~0xF)); i += 16 ) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); + xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); + xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); + + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); + _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); + + y += num_elem_per_reg * 4; + x += num_elem_per_reg * 4; + } + for ( ; i < (n & (~0x07)); i += 8 ) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + xv[1] = _mm256_loadu_pd(x + 
num_elem_per_reg * 1); + + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); + + y += num_elem_per_reg * 2; + x += num_elem_per_reg * 2; + } + for ( ; i < (n & (~0x03)); i += 4 ) + { + xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); + _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); + y += num_elem_per_reg; + x += num_elem_per_reg; + } + for ( ; i < n; ++i ) + { + *y++ = *x++; + } +#endif + } + else + { + for ( i = 0; i < n; ++i ) + { + *y = *x; + + x += incx; + y += incy; + } + } +} + diff --git a/kernels/zen/1/bli_dotv_zen_int.c b/kernels/zen/1/bli_dotv_zen_int.c index 01022d353..03c448f85 100644 --- a/kernels/zen/1/bli_dotv_zen_int.c +++ b/kernels/zen/1/bli_dotv_zen_int.c @@ -62,7 +62,7 @@ void bli_sdotv_zen_int float* restrict x, inc_t incx, float* restrict y, inc_t incy, float* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -184,7 +184,7 @@ void bli_ddotv_zen_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c index 8c445849b..f3fe5ea71 100644 --- a/kernels/zen/1/bli_dotv_zen_int10.c +++ b/kernels/zen/1/bli_dotv_zen_int10.c @@ -63,7 +63,7 @@ void bli_sdotv_zen_int10 float* restrict x, inc_t incx, float* restrict y, inc_t incy, float* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -254,7 +254,7 @@ void bli_ddotv_zen_int10 double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_dotxv_zen_int.c b/kernels/zen/1/bli_dotxv_zen_int.c index 99ea51710..48a9878a7 100644 --- a/kernels/zen/1/bli_dotxv_zen_int.c +++ b/kernels/zen/1/bli_dotxv_zen_int.c @@ -64,7 +64,7 @@ void bli_sdotxv_zen_int float* 
restrict y, inc_t incy, float* restrict beta, float* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -192,7 +192,7 @@ void bli_ddotxv_zen_int double* restrict y, inc_t incy, double* restrict beta, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; @@ -264,7 +264,7 @@ void bli_ddotxv_zen_int x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); - + // Compute the element-wise product of the x and y vectors, // storing in the corresponding rho vectors. rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v ); diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c index fb17dd4b3..f92cb0c6c 100644 --- a/kernels/zen/1/bli_scalv_zen_int.c +++ b/kernels/zen/1/bli_scalv_zen_int.c @@ -61,7 +61,7 @@ void bli_sscalv_zen_int dim_t n, float* restrict alpha, float* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -160,7 +160,7 @@ void bli_dscalv_zen_int dim_t n, double* restrict alpha, double* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index 9f31b7200..7487880b8 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -61,7 +61,7 @@ void bli_sscalv_zen_int10 dim_t n, float* restrict alpha, float* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -82,7 +82,7 @@ void bli_sscalv_zen_int10 { float* zero = bli_s0; - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); @@ -255,7 +255,7 @@ void bli_dscalv_zen_int10 dim_t n, double* restrict alpha, double* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { 
const dim_t n_elem_per_reg = 4; @@ -276,7 +276,7 @@ void bli_dscalv_zen_int10 { double* zero = bli_d0; - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); diff --git a/kernels/zen/1/bli_setv_zen_int.c b/kernels/zen/1/bli_setv_zen_int.c index 16e02c94d..0fbc24cfd 100644 --- a/kernels/zen/1/bli_setv_zen_int.c +++ b/kernels/zen/1/bli_setv_zen_int.c @@ -43,7 +43,7 @@ void bli_ssetv_zen_int dim_t n, float* restrict alpha, float* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t num_elem_per_reg = 8; @@ -138,7 +138,7 @@ void bli_dsetv_zen_int dim_t n, double* restrict alpha, double* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t num_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_swapv_zen_int8.c b/kernels/zen/1/bli_swapv_zen_int8.c index aa7a6e339..824fd0fb8 100644 --- a/kernels/zen/1/bli_swapv_zen_int8.c +++ b/kernels/zen/1/bli_swapv_zen_int8.c @@ -59,7 +59,7 @@ void bli_sswapv_zen_int8 dim_t n, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { @@ -205,7 +205,7 @@ void bli_dswapv_zen_int8 dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c index 0ec5f44f5..ddebc5ee0 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_4.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c @@ -79,7 +79,7 @@ void bli_caxpyf_zen_int_4 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c index 1566f9809..9c8a370e1 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -108,7 +108,7 @@ void bli_saxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); @@ -360,7 +360,7 @@ void bli_daxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); @@ -899,7 +899,7 @@ void bli_daxpyf_zen_int_16x4 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c index 15fdf4651..24e6ee5e2 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_8.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c @@ -64,7 +64,7 @@ void bli_saxpyf_zen_int_8 float* restrict a, inc_t inca, inc_t lda, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; @@ -273,7 +273,7 @@ void bli_daxpyf_zen_int_8 double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c index 1f4a671b6..50ca92561 100644 --- a/kernels/zen/1f/bli_dotxf_zen_int_8.c +++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c @@ -65,7 +65,7 @@ void bli_sdotxf_zen_int_8 float* restrict x, inc_t incx, float* restrict beta, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; @@ -455,7 +455,7 @@ void bli_ddotxf_zen_int_8 double* restrict x, inc_t incx, double* restrict beta, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c index 8d10406a0..076953725 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c @@ -123,8 +123,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* 
data, + cntx_t* cntx ) { uint64_t n_left = n0 % 8; @@ -495,7 +495,7 @@ void bli_cgemmsup_rv_zen_asm_3x8m vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm13, ymm13) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate @@ -583,7 +583,7 @@ void bli_cgemmsup_rv_zen_asm_3x8m CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm9, ymm0, ymm9) - add(rdi, rcx) + add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm13, ymm0, ymm13) @@ -609,18 +609,18 @@ void bli_cgemmsup_rv_zen_asm_3x8m vmovups(xmm2, mem(rcx)) // store (gamma03-13) vmovhpd(xmm12, mem(rcx, 16)) // store (gamma33) lea(mem(rcx, rsi, 1), rcx) - + /******************Transpose bottom tile 4x3***************************/ vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 a12a13b12b13 //gamma04-14 gamma06-16 vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 //gamma05-15 gamma07-17 - + vmovups(xmm0, mem(rcx)) // store (gamma04-14) vmovlpd(xmm13, mem(rcx, 16)) // store (gamma24) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma05-15) vmovhpd(xmm13, mem(rcx, 16)) // store (gamma25) lea(mem(rcx, rsi, 1), rcx) - + vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vextractf128(imm(0x1), ymm13, xmm13) @@ -658,8 +658,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) - vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 - vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 + vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 + vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 /******************Transpose top tile 4x3***************************/ vmovups(xmm0, mem(rcx)) @@ -680,8 +680,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m lea(mem(rcx, rsi, 1), rcx) /******************Transpose bottom tile 4x3***************************/ - vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 
a12a13b12b13 - vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 + vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 a12a13b12b13 + vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 vmovups(xmm0, mem(rcx)) vmovlpd(xmm13, mem(rcx, 16)) @@ -788,8 +788,8 @@ void bli_cgemmsup_rv_zen_asm_3x4m scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1060,7 +1060,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm12, ymm12) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate @@ -1117,7 +1117,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm4, ymm0, ymm4) add(rdi, rcx) - + CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm8, ymm0, ymm8) add(rdi, rcx) @@ -1136,7 +1136,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m vmovups(xmm2, mem(rcx)) // store (gamma01-11) vmovhpd(xmm12, mem(rcx, 16)) // store (gamma21) lea(mem(rcx, rsi, 1), rcx) - + vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vextractf128(imm(0x1), ymm12, xmm12) @@ -1172,8 +1172,8 @@ void bli_cgemmsup_rv_zen_asm_3x4m mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) - vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 - vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 + vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 + vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 vmovups(xmm0, mem(rcx)) vmovlpd(xmm12, mem(rcx, 16)) @@ -1277,8 +1277,8 @@ void bli_cgemmsup_rv_zen_asm_3x2m scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* 
restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1543,7 +1543,7 @@ void bli_cgemmsup_rv_zen_asm_3x2m vmulps(xmm1, xmm3, xmm3) vaddsubps(xmm3, xmm12, xmm12) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), xmm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), xmm2) // load beta_i and duplicate @@ -1627,7 +1627,7 @@ void bli_cgemmsup_rv_zen_asm_3x2m CGEMM_INPUT_SCALE_CS_BETA_NZ_128 vaddps(xmm4, xmm0, xmm4) add(rdi, rcx) - + CGEMM_INPUT_SCALE_CS_BETA_NZ_128 vaddps(xmm8, xmm0, xmm8) add(rdi, rcx) @@ -1753,4 +1753,3 @@ void bli_cgemmsup_rv_zen_asm_3x2m } } - \ No newline at end of file diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c index 6c68707e1..62491dfb4 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c @@ -80,14 +80,14 @@ void bli_cgemmsup_rv_zen_asm_3x8n scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 3; if ( m_left ) { - cgemmsup_ker_ft ker_fps[3] = + cgemmsup_ker_ft ker_fps[3] = { NULL, bli_cgemmsup_rv_zen_asm_1x8n, @@ -120,7 +120,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - + if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- @@ -150,7 +150,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -170,7 +170,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n // This loop is processing MR x K ymm0 = 
_mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter + 4)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_ss((float const *)(tA)); @@ -534,8 +534,8 @@ void bli_cgemmsup_rv_zen_asm_2x8n scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -600,7 +600,7 @@ void bli_cgemmsup_rv_zen_asm_2x8n // This loop is processing MR x K ymm0 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter + 4)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_ss((float const *)(tA)); @@ -882,8 +882,8 @@ void bli_cgemmsup_rv_zen_asm_1x8n scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1151,8 +1151,8 @@ void bli_cgemmsup_rv_zen_asm_3x4 scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1184,7 +1184,7 @@ void bli_cgemmsup_rv_zen_asm_3x4 ymm10 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -1386,8 +1386,8 @@ void bli_cgemmsup_rv_zen_asm_3x2 scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1408,7 +1408,7 @@ void bli_cgemmsup_rv_zen_asm_3x2 scomplex 
*tB = b; scomplex *tC = c; // clear scratch registers. - __m128 xmm0, xmm1, xmm2, xmm3; + __m128 xmm0, xmm1, xmm2, xmm3; __m128 xmm4 = _mm_setzero_ps(); __m128 xmm6 = _mm_setzero_ps(); __m128 xmm8 = _mm_setzero_ps(); diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c index 1638eaba0..b9ed3c9f9 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c @@ -82,8 +82,8 @@ void bli_zgemmsup_rv_zen_asm_2x4 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -357,7 +357,7 @@ void bli_zgemmsup_rv_zen_asm_2x4 vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm9, ymm9) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -413,7 +413,7 @@ void bli_zgemmsup_rv_zen_asm_2x4 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -423,16 +423,16 @@ void bli_zgemmsup_rv_zen_asm_2x4 ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm8, ymm0, ymm8) add(rdi, rcx) - + lea(mem(r12, rsi, 2), rcx) - + ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm5, ymm0, ymm5) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm9, ymm0, ymm9) - add(rdi, rcx) + add(rdi, rcx) mov(r12, rcx) // reset rcx to current utile of c. 
@@ -454,12 +454,12 @@ void bli_zgemmsup_rv_zen_asm_2x4 vmovups(xmm8, mem(rcx, 16)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vmovups(xmm5, mem(rcx)) @@ -501,12 +501,12 @@ void bli_zgemmsup_rv_zen_asm_2x4 vmovups(xmm8, mem(rcx, 16)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vmovups(xmm5, mem(rcx)) @@ -558,8 +558,8 @@ void bli_zgemmsup_rv_zen_asm_1x4 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -781,7 +781,7 @@ void bli_zgemmsup_rv_zen_asm_1x4 vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm5, ymm5) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -828,14 +828,14 @@ void bli_zgemmsup_rv_zen_asm_1x4 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm4, ymm0, ymm4) lea(mem(r12, rsi, 2), rcx) - + ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm5, ymm0, ymm5) @@ -854,7 +854,7 @@ void bli_zgemmsup_rv_zen_asm_1x4 vmovups(xmm4, mem(rcx)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) add(rsi, rcx) @@ -943,8 +943,8 @@ void bli_zgemmsup_rv_zen_asm_2x2 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1178,7 +1178,7 @@ void bli_zgemmsup_rv_zen_asm_2x2 
vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -1226,7 +1226,7 @@ void bli_zgemmsup_rv_zen_asm_2x2 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -1330,8 +1330,8 @@ void bli_zgemmsup_rv_zen_asm_1x2 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1529,7 +1529,7 @@ void bli_zgemmsup_rv_zen_asm_1x2 vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -1571,7 +1571,7 @@ void bli_zgemmsup_rv_zen_asm_1x2 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -1602,7 +1602,7 @@ void bli_zgemmsup_rv_zen_asm_1x2 vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + jmp(.SDONE) // jump to end. 
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c index 05e05dfec..1dd37a395 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c @@ -97,7 +97,7 @@ crr: | | | | | | | | ------ -------- - | | | | | | | | += ------ + | | | | | | | | += ------ -------- | | | | | | | | ------ -------- | | | | | | | | ------ : @@ -114,8 +114,8 @@ void bli_zgemmsup_rv_zen_asm_3x4m dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 4; @@ -477,7 +477,7 @@ void bli_zgemmsup_rv_zen_asm_3x4m vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm13, ymm13) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -563,7 +563,7 @@ void bli_zgemmsup_rv_zen_asm_3x4m ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm9, ymm0, ymm9) - add(rdi, rcx) + add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm13, ymm0, ymm13) @@ -591,13 +591,13 @@ void bli_zgemmsup_rv_zen_asm_3x4m vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vextractf128(imm(0x1), ymm13, xmm13) @@ -649,13 +649,13 @@ void bli_zgemmsup_rv_zen_asm_3x4m vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vextractf128(imm(0x1), ymm13, xmm13) @@ -750,8 +750,8 @@ void bli_zgemmsup_rv_zen_asm_3x2m dcomplex* restrict b, 
inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1025,7 +1025,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm12, ymm12) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -1079,7 +1079,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -1089,7 +1089,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm8, ymm0, ymm8) add(rdi, rcx) - + ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm12, ymm0, ymm12) @@ -1126,10 +1126,10 @@ void bli_zgemmsup_rv_zen_asm_3x2m vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm8, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm12, mem(rcx)) jmp(.SDONE) // jump to end. 
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c index 872d04868..58d08ecbd 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c @@ -79,14 +79,14 @@ void bli_zgemmsup_rv_zen_asm_3x4n dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 3; if ( m_left ) { - zgemmsup_ker_ft ker_fps[3] = + zgemmsup_ker_ft ker_fps[3] = { NULL, bli_zgemmsup_rv_zen_asm_1x4n, @@ -150,7 +150,7 @@ void bli_zgemmsup_rv_zen_asm_3x4n ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -170,7 +170,7 @@ void bli_zgemmsup_rv_zen_asm_3x4n // This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter + 2)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm2 = _mm256_broadcast_sd((double const *)(tA)); @@ -472,8 +472,8 @@ void bli_zgemmsup_rv_zen_asm_2x4n dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -515,7 +515,7 @@ void bli_zgemmsup_rv_zen_asm_2x4n ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -535,7 +535,7 @@ void bli_zgemmsup_rv_zen_asm_2x4n // This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter + 2)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_sd((double const *)(tA)); @@ -772,8 +772,8 @@ void bli_zgemmsup_rv_zen_asm_1x4n dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -832,7 +832,7 @@ void bli_zgemmsup_rv_zen_asm_1x4n // This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter + 2)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_sd((double const *)(tA)); @@ -999,8 +999,8 @@ void bli_zgemmsup_rv_zen_asm_3x2 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t k_iter = 0; @@ -1046,7 +1046,7 @@ void bli_zgemmsup_rv_zen_asm_3x2 // multiplies it with the A matrix. 
// This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_sd((double const *)(tA)); diff --git a/kernels/zen2/1f/old/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/old/bli_axpyf_zen_int_5.c new file mode 100644 index 000000000..8a60bce46 --- /dev/null +++ b/kernels/zen2/1f/old/bli_axpyf_zen_int_5.c @@ -0,0 +1,599 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "blis.h" + +/* Union data structure to access AVX registers + One 256-bit AVX register holds 8 SP elements. */ +typedef union +{ + __m256 v; + float f[8] __attribute__((aligned(64))); +} v8sf_t; + +/* Union data structure to access AVX registers +* One 256-bit AVX register holds 4 DP elements. */ +typedef union +{ + __m256d v; + double d[4] __attribute__((aligned(64))); +} v4df_t; + + +void bli_saxpyf_zen_int_5 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + float* restrict alpha, + float* restrict a, inc_t inca, inc_t lda, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + cntx_t* cntx + ) +{ + const dim_t fuse_fac = 5; + + const dim_t n_elem_per_reg = 8; + const dim_t n_iter_unroll = 2; + + dim_t i; + + float* restrict a0; + float* restrict a1; + float* restrict a2; + float* restrict a3; + float* restrict a4; + + float* restrict y0; + + v8sf_t chi0v, chi1v, chi2v, chi3v; + v8sf_t chi4v; + + v8sf_t a00v, a01v, a02v, a03v; + v8sf_t a04v; + + v8sf_t a10v, a11v, a12v, a13v; + v8sf_t a14v; + + v8sf_t y0v, y1v; + + float chi0, chi1, chi2, chi3; + float chi4; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. 
+ if ( b_n != fuse_fac ) + { +#ifdef BLIS_CONFIG_ZEN2 + for ( i = 0; i < b_n; ++i ) + { + float* a1 = a + (0 )*inca + (i )*lda; + float* chi1 = x + (i )*incx; + float* y1 = y + (0 )*incy; + float alpha_chi1; + + bli_scopycjs( conjx, *chi1, alpha_chi1 ); + bli_sscals( *alpha, alpha_chi1 ); + + bli_saxpyv_zen_int10 + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + +#else + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + float* a1 = a + (0 )*inca + (i )*lda; + float* chi1 = x + (i )*incx; + float* y1 = y + (0 )*incy; + float alpha_chi1; + + bli_scopycjs( conjx, *chi1, alpha_chi1 ); + bli_sscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + +#endif + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + a4 = a + 4*lda; + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + chi4 = *( x + 4*incx ); + + + // Scale each chi scalar by alpha. + bli_sscals( *alpha, chi0 ); + bli_sscals( *alpha, chi1 ); + bli_sscals( *alpha, chi2 ); + bli_sscals( *alpha, chi3 ); + bli_sscals( *alpha, chi4 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_ss( &chi0 ); + chi1v.v = _mm256_broadcast_ss( &chi1 ); + chi2v.v = _mm256_broadcast_ss( &chi2 ); + chi3v.v = _mm256_broadcast_ss( &chi3 ); + chi4v.v = _mm256_broadcast_ss( &chi4 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 15) < m; i += 16 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg ); + + a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); + a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v ); + + + // Store the output. + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + a4 += n_iter_unroll * n_elem_per_reg; + } + + for( ; (i + 7) < m; i += 8 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); + a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); + a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); + a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); + a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); + + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); + + // Store the output. + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + a4 += n_elem_per_reg; + } + + // If there are leftover iterations, perform them with scalar code. + for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const float a0c = *a0; + const float a1c = *a1; + const float a2c = *a2; + const float a3c = *a3; + const float a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + a4 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const float a0c = *a0; + const float a1c = *a1; + const float a2c = *a2; + const float a3c = *a3; + const float a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + a4 += inca; + y0 += incy; + } + + } +} + + +// ----------------------------------------------------------------------------- + +void bli_daxpyf_zen_int_5 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + 
double* restrict y, inc_t incy, + cntx_t* cntx + ) +{ + const dim_t fuse_fac = 5; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 2; + + dim_t i; + + double* restrict a0; + double* restrict a1; + double* restrict a2; + double* restrict a3; + double* restrict a4; + + double* restrict y0; + + v4df_t chi0v, chi1v, chi2v, chi3v; + v4df_t chi4v; + + v4df_t a00v, a01v, a02v, a03v; + v4df_t a04v; + + v4df_t a10v, a11v, a12v, a13v; + v4df_t a14v; + + v4df_t y0v, y1v; + + double chi0, chi1, chi2, chi3; + double chi4; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. + if ( b_n != fuse_fac ) + { +#ifdef BLIS_CONFIG_ZEN2 + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + bli_daxpyv_zen_int10 + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + +#else + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + +#endif + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + a4 = a + 4*lda; + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + chi4 = *( x + 4*incx ); + + + // Scale each chi scalar by alpha. 
+ bli_dscals( *alpha, chi0 ); + bli_dscals( *alpha, chi1 ); + bli_dscals( *alpha, chi2 ); + bli_dscals( *alpha, chi3 ); + bli_dscals( *alpha, chi4 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + chi2v.v = _mm256_broadcast_sd( &chi2 ); + chi3v.v = _mm256_broadcast_sd( &chi3 ); + chi4v.v = _mm256_broadcast_sd( &chi4 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v ); + + + // Store the output. 
+ _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + a4 += n_iter_unroll * n_elem_per_reg; + } + + for( ; (i + 3) < m; i += 4 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + a4 += n_elem_per_reg; + } + + // If there are leftover iterations, perform them with scalar code. 
+ for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + a4 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + a4 += inca; + y0 += incy; + } + + } +} + diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c index 6724cdfd1..bb637d7e6 100644 --- a/ref_kernels/1/bli_addv_ref.c +++ b/ref_kernels/1/bli_addv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c index 169180f3b..cdfae9568 100644 --- a/ref_kernels/1/bli_amaxv_ref.c +++ b/ref_kernels/1/bli_amaxv_ref.c @@ -46,7 +46,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict i_max, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ ctype_r* minus_one = PASTEMAC(chr,m1); \ diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c index 2da4bc928..fb48070a5 100644 --- a/ref_kernels/1/bli_axpbyv_ref.c +++ b/ref_kernels/1/bli_axpbyv_ref.c @@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( 
bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c index 30076ddaf..295fcf24c 100644 --- a/ref_kernels/1/bli_axpyv_ref.c +++ b/ref_kernels/1/bli_axpyv_ref.c @@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ @@ -135,7 +135,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c index 9cf005aae..1202aa896 100644 --- a/ref_kernels/1/bli_copyv_ref.c +++ b/ref_kernels/1/bli_copyv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c index f2cfae78b..d17c71dd3 100644 --- a/ref_kernels/1/bli_dotv_ref.c +++ b/ref_kernels/1/bli_dotv_ref.c @@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ ctype dotxy; \ diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c index e2283bcc6..caea62176 100644 --- a/ref_kernels/1/bli_dotxv_ref.c +++ b/ref_kernels/1/bli_dotxv_ref.c @@ -47,7 +47,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ ctype dotxy; \ diff --git a/ref_kernels/1/bli_invertv_ref.c b/ref_kernels/1/bli_invertv_ref.c index 07c52d82d..914663c82 100644 --- a/ref_kernels/1/bli_invertv_ref.c 
+++ b/ref_kernels/1/bli_invertv_ref.c @@ -41,7 +41,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c index ba0595990..f4785c228 100644 --- a/ref_kernels/1/bli_scal2v_ref.c +++ b/ref_kernels/1/bli_scal2v_ref.c @@ -44,7 +44,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c index 3e6be7492..6ca9a88a5 100644 --- a/ref_kernels/1/bli_scalv_ref.c +++ b/ref_kernels/1/bli_scalv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c index 862ff177d..be6e76cbb 100644 --- a/ref_kernels/1/bli_setv_ref.c +++ b/ref_kernels/1/bli_setv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c index 6b512909f..ce1ec2079 100644 --- a/ref_kernels/1/bli_subv_ref.c +++ b/ref_kernels/1/bli_subv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_swapv_ref.c b/ref_kernels/1/bli_swapv_ref.c index 6f8d54f66..73a90c87b 100644 --- a/ref_kernels/1/bli_swapv_ref.c +++ 
b/ref_kernels/1/bli_swapv_ref.c @@ -42,7 +42,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c index 28286a5f8..0a6844bf1 100644 --- a/ref_kernels/1/bli_xpbyv_ref.c +++ b/ref_kernels/1/bli_xpbyv_ref.c @@ -44,7 +44,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c index 6439ff8b0..0563322ae 100644 --- a/ref_kernels/1f/bli_axpy2v_ref.c +++ b/ref_kernels/1f/bli_axpy2v_ref.c @@ -48,7 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c index 5799a03a6..873cee563 100644 --- a/ref_kernels/1f/bli_axpyf_ref.c +++ b/ref_kernels/1f/bli_axpyf_ref.c @@ -48,7 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( m ) ) return; \ diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c index 42936c650..b83b927c9 100644 --- a/ref_kernels/1f/bli_dotaxpyv_ref.c +++ b/ref_kernels/1f/bli_dotaxpyv_ref.c @@ -49,7 +49,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( m ) ) return; \ diff --git 
a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c index 990133621..249b9a6de 100644 --- a/ref_kernels/1f/bli_dotxaxpyf_ref.c +++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c @@ -53,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ /* A is m x n. */ \ diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c index 86781fd58..2d2da1318 100644 --- a/ref_kernels/1f/bli_dotxf_ref.c +++ b/ref_kernels/1f/bli_dotxf_ref.c @@ -49,7 +49,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c index 5cee5535b..e2008d255 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c @@ -122,7 +122,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const num_t dt_r = PASTEMAC(chr,type); \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c index 80ffcbc14..d12ff59ab 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c @@ -66,7 +66,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 56d8379be..f3dd3d78f 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ 
b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -87,7 +87,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const dim_t dfac = PASTECH2(bb0, _, chr); \ @@ -99,15 +99,15 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t mnr = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \ const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ \ - ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ - ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ - ctype_r* restrict alpha1 = ( ctype_r* )a; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p; \ - ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ \ if ( cdim == mnr && mnr != -1 ) \ { \ @@ -140,15 +140,15 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t mnr = PASTECH2(mnr0, _, chr); \ const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ \ - ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ - ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ - ctype_r* restrict alpha1 = ( ctype_r* )a; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* 
restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( cdim == mnr && mnr != -1 ) \ { \ diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index eefdb464b..efbbc95e4 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -63,7 +63,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const dim_t mnr = PASTECH2(mnr0, _, ch); \ diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c index 73d98e268..172e93bdf 100644 --- a/ref_kernels/1m/bli_unpackm_cxk_ref.c +++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c @@ -61,7 +61,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const dim_t mnr = PASTECH2(mnr0, _, ch); \ diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index f284acb98..26eda0c65 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -50,31 +50,31 @@ static void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ - const num_t dt = PASTEMAC(ch,type); \ + const num_t dt = PASTEMAC(ch,type); \ \ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ - const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ - const inc_t cs_a = packmr; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, 
BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = m; \ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = m; \ \ - dim_t l, j, i; \ + dim_t l, j, i; \ \ - ctype ai; \ - ctype bj; \ + ctype ai; \ + ctype bj; \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ @@ -118,18 +118,24 @@ static void PASTEMAC3(ch,opname,arch,suf) \ scale by beta and then add the scaled redult in ab. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - PASTEMAC(ch,copys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - c, rs_c, cs_c ); \ + PASTEMAC(ch,copys_mxn) \ + ( \ + m, \ + n, \ + ab, rs_ab, cs_ab, \ + c, rs_c, cs_c \ + ); \ } \ else \ { \ - PASTEMAC(ch,xpbys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - beta, \ - c, rs_c, cs_c ); \ + PASTEMAC(ch,xpbys_mxn) \ + ( \ + m, \ + n, \ + ab, rs_ab, cs_ab, \ + beta, \ + c, rs_c, cs_c \ + ); \ } \ } @@ -153,13 +159,13 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ \ - const dim_t mr = PASTECH(BLIS_MR_,ch); \ - const dim_t nr = PASTECH(BLIS_NR_,ch); \ + const dim_t mr = PASTECH(BLIS_MR_,ch); \ + const dim_t nr = PASTECH(BLIS_NR_,ch); \ \ if ( mr == -1 || nr == -1 ) \ { \ diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c index 0c3773c1c..9cadb3bd6 100644 --- a/ref_kernels/3/bli_gemmsup_ref.c +++ b/ref_kernels/3/bli_gemmsup_ref.c @@ -53,12 +53,12 @@ void 
PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ + values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ @@ -258,12 +258,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ + values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ @@ -478,17 +478,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ - const dim_t mn = m * n; \ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ \ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = n; \ - const inc_t cs_ab = 1; \ + const dim_t mn = m * n; \ + const inc_t rs_ab = n; \ + const inc_t cs_ab = 1; \ \ \ /* Assumptions: m <= mr, n <= nr so that the temporary array ab is @@ -602,17 +602,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ - const dim_t mn = m * n; \ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / 
sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ \ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = m; \ + const dim_t mn = m * n; \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = m; \ \ \ /* Assumptions: m <= mr, n <= nr so that the temporary array ab is @@ -725,17 +725,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ - const dim_t mn = m * n; \ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ \ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = m; \ + const dim_t mn = m * n; \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = m; \ \ \ /* Assumptions: m <= mr, n <= nr so that the temporary array ab is diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 046aa5617..0a11aa052 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -51,30 +51,30 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ - const num_t dt = PASTEMAC(ch,type); \ + const num_t dt = PASTEMAC(ch,type); \ \ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ - const inc_t packnr = 
bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ /* printf( "bli_gemmtrsm_ref(): cs_b = %d\n", (int)cs_b ); \ printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ */ \ \ - ctype* minus_one = PASTEMAC(ch,m1); \ + ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ + trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ @@ -94,15 +94,15 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ circumstances where we would want the gemmtrsm_? operations to have and exercise their own IO preferences -- I'd have to think about it -- but this doesn't seem to be one of them. */ \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : nr ); \ - const inc_t cs_ct = ( col_pref ? mr : 1 ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : nr ); \ + const inc_t cs_ct = ( col_pref ? 
mr : 1 ); \ \ - const bool use_ct = ( m < mr || n < nr ); \ + const bool use_ct = ( m < mr || n < nr ); \ \ - ctype* restrict c11_use = c11; \ - inc_t rs_c_use = rs_c; \ - inc_t cs_c_use = cs_c; \ + ctype* restrict c11_use = c11; \ + inc_t rs_c_use = rs_c; \ + inc_t cs_c_use = cs_c; \ \ if ( use_ct ) \ { \ diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c index 8234a84cc..f115e2a60 100644 --- a/ref_kernels/3/bli_trsm_ref.c +++ b/ref_kernels/3/bli_trsm_ref.c @@ -45,8 +45,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -74,10 +74,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ i = iter; \ n_behind = i; \ \ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \ - ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ + ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \ + ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \ + ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ @@ -132,8 +132,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -158,10 +158,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t i = m - iter - 1; \ dim_t n_behind = iter; \ \ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \ + ctype* restrict alpha11 = a + (i )*rs_a + 
(i )*cs_a; \ + ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ + ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 69c546cd4..e094db54b 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -306,7 +306,7 @@ void GENBARNAME(cntx_init) // -- Set level-3 virtual micro-kernels ------------------------------------ - funcs = bli_cntx_ukrs_buf( cntx ); + funcs = cntx->ukrs; // NOTE: We set the virtual micro-kernel slots to contain the addresses // of the native micro-kernels. In general, the ukernels in the virtual @@ -322,7 +322,7 @@ void GENBARNAME(cntx_init) // -- Set level-3 native micro-kernels and preferences --------------------- - mbools = bli_cntx_ukr_prefs_buf( cntx ); + mbools = cntx->ukr_prefs; gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); @@ -416,7 +416,7 @@ void GENBARNAME(cntx_init) // -- Set level-3 small/unpacked handlers ---------------------------------- - vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); + vfuncs = cntx->l3_sup_handlers; // Initialize all of the function pointers to NULL; for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; @@ -452,7 +452,7 @@ void GENBAINAME(cntx_init) // -- Set induced method level-3 virtual micro-kernels --------------------- - funcs = bli_cntx_ukrs_buf( cntx ); + funcs = cntx->ukrs; if ( method == BLIS_1M ) { @@ -483,8 +483,8 @@ void GENBAINAME(cntx_init) // beta has a zero imaginary component and C is either row- or column-stored). 
if ( method == BLIS_1M ) { - func_t* gemm_nat_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); - func_t* gemm_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); + func_t* gemm_nat_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_vir_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); bli_func_copy_dt( BLIS_FLOAT, gemm_nat_ukrs, BLIS_FLOAT, gemm_vir_ukrs ); bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs ); diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index 2f0808389..317cf2604 100644 --- a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -47,8 +47,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 6cfb83cae..1688b688d 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -48,8 +48,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c index 5eda20f20..37551b399 100644 --- a/ref_kernels/ind/bli_trsm1m_ref.c +++ b/ref_kernels/ind/bli_trsm1m_ref.c @@ -43,8 +43,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -92,13 +92,13 @@ void PASTEMAC3(ch,opname,arch,suf) \ i = iter; \ 
n_behind = i; \ \ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ - ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ - ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ - ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; \ - ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ - ctype_r* restrict B0_ri = b_ri + (0 )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ + ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_ri = b_ri + (0 )*rs_b2 + (0 )*cs_b2; \ \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ @@ -261,8 +261,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index eeebf15e7..3019d472b 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -176,7 +176,7 @@ void libblis_test_axpy2v_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -220,7 +220,7 @@ void libblis_test_axpy2v_experiment bli_obj_set_conj( conjx, &x ); bli_obj_set_conj( conjy, &y ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. 
for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &z_save, &z ); diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 7a85b2212..42ab73018 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -174,7 +174,7 @@ void libblis_test_axpyf_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -223,7 +223,7 @@ void libblis_test_axpyf_experiment bli_obj_set_conj( conja, &a ); bli_obj_set_conj( conjx, &x ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &y_save, &y ); diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index 391c119bb..8e09e3ee1 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -179,7 +179,7 @@ void libblis_test_dotaxpyv_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -222,7 +222,7 @@ void libblis_test_dotaxpyv_experiment bli_obj_alias_to( &x, &xt ); // Determine whether to make a copy of x with or without conjugation. - // + // // conjx conjy ~conjx^conjy y is initialized as // n n c y = conj(x) // n c n y = x @@ -239,7 +239,7 @@ void libblis_test_dotaxpyv_experiment bli_obj_set_conj( conjx, &x ); bli_obj_set_conj( conjy, &y ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. 
for ( i = 0; i < n_repeats; ++i ) { bli_copysc( &BLIS_MINUS_ONE, &rho ); diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index a2c3ef3e9..ec519de51 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -184,7 +184,7 @@ void libblis_test_dotxaxpyf_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -251,7 +251,7 @@ void libblis_test_dotxaxpyf_experiment bli_obj_set_conj( conjw, &w ); bli_obj_set_conj( conjx, &x ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &y_save, &y ); diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 8a1eca4eb..83f4b44eb 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -176,7 +176,7 @@ void libblis_test_dotxf_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -228,7 +228,7 @@ void libblis_test_dotxf_experiment bli_obj_set_conj( conjat, &a ); bli_obj_set_conj( conjx, &x ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &y_save, &y ); diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index d37005b28..69ee4339d 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -181,7 +181,7 @@ void libblis_test_gemm_ukr_experiment // Query a context. 
- cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 48fcb78db..44ba51587 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -207,7 +207,7 @@ void libblis_test_gemmtrsm_ukr_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index edab9796d..f267ae158 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -977,7 +977,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); // Query a native context. - cntx = bli_gks_query_nat_cntx(); + cntx = ( cntx_t* )bli_gks_query_nat_cntx(); libblis_test_fprintf_c( os, "level-3 blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d %7d %7d\n", @@ -1081,8 +1081,8 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); // Query a native context. - cntx_c = bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX ); - cntx_z = bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX ); + cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX ); + cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX ); libblis_test_fprintf_c( os, "level-3 blocksizes c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d\n", @@ -2178,7 +2178,7 @@ void libblis_test_op_driver // Query the implementation string associated with the // current operation and datatype. If the operation is // not level-3, we will always get back the native string. 
- ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype ); + ind_str = ( char* )bli_ind_oper_get_avail_impl_string( op->opid, datatype ); // Loop over the requested parameter combinations. for ( pci = 0; pci < n_param_combos; ++pci ) @@ -3051,7 +3051,7 @@ void libblis_test_parse_command_line( int argc, char** argv ) bli_getopt_init_state( 0, &state ); // Process all option arguments until we get a -1, which means we're done. - while( (opt = bli_getopt( argc, argv, "g:o:", &state )) != -1 ) + while( (opt = bli_getopt( argc, ( const char** )argv, "g:o:", &state )) != -1 ) { // Explicitly typecast opt, which is an int, to a char. (Failing to // typecast resulted in at least one user-reported problem whereby diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 9568dfee7..5f4988e1c 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -186,7 +186,7 @@ void libblis_test_trsm_ukr_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); From 6431c9e13b86e4442b6aacba18a0ace12288c955 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 14 Apr 2022 13:01:24 -0500 Subject: [PATCH 052/230] Added missing 'const' to zen bli_gemm_small.c. Details: - Added missing 'const' qualifiers to signatures of functions defined in kernels/zen/3/bli_gemm_small.c. This fixes compile-time errors when targeting 'zen3' subconfig (which apparently is enabling AMD's gemm_small code path by default). Thanks to Devin Matthews for reporting this error. 
--- kernels/zen/3/bli_gemm_small.c | 126 ++++++++++++++++----------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c index b04ffea58..890c5bc2d 100644 --- a/kernels/zen/3/bli_gemm_small.c +++ b/kernels/zen/3/bli_gemm_small.c @@ -55,46 +55,46 @@ #define AT_MR 4 // The kernel dimension of the A transpose GEMM kernel.(AT_MR * NR). static err_t bli_sgemm_small ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ); static err_t bli_dgemm_small ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ); static err_t bli_sgemm_small_atbn ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ); static err_t bli_dgemm_small_atbn ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ); /* * The bli_gemm_small function will use the @@ -103,13 +103,13 @@ static err_t bli_dgemm_small_atbn */ err_t bli_gemm_small ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); @@ -168,13 +168,13 @@ err_t bli_gemm_small static err_t bli_sgemm_small ( - obj_t* alpha, - obj_t* 
a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); @@ -1719,13 +1719,13 @@ static err_t bli_sgemm_small static err_t bli_dgemm_small ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ) { @@ -3327,13 +3327,13 @@ static err_t bli_dgemm_small static err_t bli_sgemm_small_atbn ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO); @@ -3804,13 +3804,13 @@ static err_t bli_sgemm_small_atbn static err_t bli_dgemm_small_atbn ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO); From 1c733402a95ab08b20f3332c2397fd52a2627cf6 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 28 Apr 2022 11:58:44 -0600 Subject: [PATCH 053/230] Fix version check for znver3, which needs gcc >= 10.3 (#628) Apple's clang-12 lacks znver3 support, unlike upstream clang-12. 
--- build/config.mk.in | 2 +- config/zen3/make_defs.mk | 6 +++++- configure | 16 ++++++++-------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/build/config.mk.in b/build/config.mk.in index 56d6211c2..1b3468642 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -94,7 +94,7 @@ CC := @CC@ GCC_OT_4_9_0 := @gcc_older_than_4_9_0@ GCC_OT_6_1_0 := @gcc_older_than_6_1_0@ GCC_OT_9_1_0 := @gcc_older_than_9_1_0@ -GCC_OT_10_1_0 := @gcc_older_than_10_1_0@ +GCC_OT_10_3_0 := @gcc_older_than_10_3_0@ CLANG_OT_9_0_0 := @clang_older_than_9_0_0@ CLANG_OT_12_0_0 := @clang_older_than_12_0_0@ AOCC_OT_2_0_0 := @aocc_older_than_2_0_0@ diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk index 5c68855db..cfeca4f5d 100644 --- a/config/zen3/make_defs.mk +++ b/config/zen3/make_defs.mk @@ -71,7 +71,7 @@ ifeq ($(CC_VENDOR),gcc) ifeq ($(GCC_OT_9_1_0),yes) # gcc versions older than 9.1. CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store else - ifeq ($(GCC_OT_10_1_0),yes) # gcc versions 9.1 or newer, but older than 10.1. + ifeq ($(GCC_OT_10_3_0),yes) # gcc versions 9.1 or newer, but older than 10.3. CVECFLAGS_VER := -march=znver2 else # gcc versions 10.1 or newer. CVECFLAGS_VER := -march=znver3 @@ -84,10 +84,14 @@ ifeq ($(CC_VENDOR),clang) else ifeq ($(CLANG_OT_12_0_0),yes) # clang versions 9.0 or newer, but older than 12.0. CVECFLAGS_VER := -march=znver2 + else + ifeq ($(OS_NAME),Darwin) # clang version 12.0 on OSX lacks znver3 support + CVECFLAGS_VER := -march=znver2 else # clang versions 12.0 or newer. CVECFLAGS_VER := -march=znver3 endif endif + endif else ifeq ($(CC_VENDOR),aocc) ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0. 
diff --git a/configure b/configure index f64aac705..7e825f1dc 100755 --- a/configure +++ b/configure @@ -1791,8 +1791,8 @@ check_compiler_version_ranges() # [5] https://gcc.gnu.org/onlinedocs/gcc-8.3.0/gcc/x86-Options.html#x86-Options # [6] https://gcc.gnu.org/onlinedocs/gcc-9.4.0/gcc/x86-Options.html#x86-Options # - # range: gcc < 10.1 (ie: 9.4 or older) - # variable: gcc_older_than_10_1_0 + # range: gcc < 10.3 (ie: 9.4 or older) + # variable: gcc_older_than_10_3_0 # comments: # These older versions of gcc do not explicitly support the Zen3 # microarchitecture; the newest microarchitectural value understood by @@ -1806,7 +1806,7 @@ check_compiler_version_ranges() gcc_older_than_4_9_0='no' gcc_older_than_6_1_0='no' gcc_older_than_9_1_0='no' - gcc_older_than_10_1_0='no' + gcc_older_than_10_3_0='no' clang_older_than_9_0_0='no' clang_older_than_12_0_0='no' @@ -1839,10 +1839,10 @@ check_compiler_version_ranges() gcc_older_than_9_1_0='yes' fi - # Check for gcc < 10.1.0 (ie: 9.4 or older). - if [ ${cc_major} -lt 10 ]; then - echo "${script_name}: note: found ${cc} version older than 10.1." - gcc_older_than_10_1_0='yes' + # Check for gcc < 10.3.0 (ie: 10.2 or older). + if [[ ( ${cc_major} -lt 10 ) || ( ${cc_major} -eq 10 && ${cc_minor} -lt 3 ) ]]; then + echo "${script_name}: note: found ${cc} version older than 10.3." 
+ gcc_older_than_10_3_0='yes' fi fi @@ -3835,7 +3835,7 @@ main() | sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \ | sed -e "s/@gcc_older_than_6_1_0@/${gcc_older_than_6_1_0}/g" \ | sed -e "s/@gcc_older_than_9_1_0@/${gcc_older_than_9_1_0}/g" \ - | sed -e "s/@gcc_older_than_10_1_0@/${gcc_older_than_10_1_0}/g" \ + | sed -e "s/@gcc_older_than_10_3_0@/${gcc_older_than_10_3_0}/g" \ | sed -e "s/@clang_older_than_9_0_0@/${clang_older_than_9_0_0}/g" \ | sed -e "s/@clang_older_than_12_0_0@/${clang_older_than_12_0_0}/g" \ | sed -e "s/@aocc_older_than_2_0_0@/${aocc_older_than_2_0_0}/g" \ From 64a9b061f6032e2b59613aecdbe7bb52161605c1 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 10 May 2022 14:54:22 -0500 Subject: [PATCH 054/230] Fixed misspelling of 'xpbys' in gemm macrokernel. Details: - Fixed a functionally harmless typo in bli_gemm_ker_var2.c where a few instances of the substring "xpbys" were misspelled as "xbpys". The misspellings were harmless because they were consistent, and because they referenced only local symbols. --- frame/3/gemm/bli_gemm_ker_var2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 814b47c0c..199e72cb6 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -69,10 +69,10 @@ void PASTEMAC2(chx,chy,op) \ ); \ } -INSERT_GENTFUNC2_BASIC0(xbpys_mxn_fn); -INSERT_GENTFUNC2_MIXDP0(xbpys_mxn_fn); +INSERT_GENTFUNC2_BASIC0(xpbys_mxn_fn); +INSERT_GENTFUNC2_MIXDP0(xpbys_mxn_fn); -static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn); +static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn); void bli_gemm_ker_var2 @@ -345,7 +345,7 @@ void bli_gemm_ker_var2 ); // Accumulate to C with type-casting. - xbpys_mxn[ dt_exec ][ dt_c ] + xpbys_mxn[ dt_exec ][ dt_c ] ( m_cur, n_cur, &ct, rs_ct, cs_ct, From 4603324eb090dfceaad3693a70b2d60544036aa8 Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Thu, 19 May 2022 14:07:03 -0500 Subject: [PATCH 055/230] Init/finalize via bli_pthread_switch_t API (#634). Details: - Defined and implemented a new pthread-like abstract datatype and API in bli_pthread.c. The new type, bli_pthread_switch_t, is similar to bli_pthread_once_t in some respects. The idea is that like a switch in your home that controls a light or ceiling fan, it can either be on or off. The switch starts in the off state. Moving from one state to the other (on to off; off to on) causes some action (i.e., a startup or shutdown function) to be executed. Trying to move from one state to the same state (on to on; off to off) is safe in that it results in no action. Unlike bli_pthread_once(), the API for bli_pthread_switch_t contains both _on() and _off() interfaces. Also, unlike the _once() function, the _on() and _off() functions return error codes so that the 'int' error code returned from the startup or shutdown functions may be passed back to the caller. Thanks to Devin Matthews for his input and feedback on this feature. - Replaced the previous implementation of bli_init_once() and bli_finalize_once() -- both of which used bli_pthread_once() -- with ones that rely upon bli_pthread_switch_on() and _switch_off(), respectively. This also required updating the return types of _init_apis() and _finalize_apis() to match the function pointer type required by bli_pthread_switch_on()/_switch_off(). - Comment updates. 
--- frame/base/bli_init.c | 32 +++-------- frame/base/bli_init.h | 6 +- frame/thread/bli_pthread.c | 114 +++++++++++++++++++++++++++++++++++++ frame/thread/bli_pthread.h | 25 ++++++++ 4 files changed, 149 insertions(+), 28 deletions(-) diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index e616ac2d7..f1baa2c21 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -64,25 +64,21 @@ void bli_finalize_auto( void ) // ----------------------------------------------------------------------------- -// A pthread_once_t variable is a pthread structure used in pthread_once(). -// pthread_once() is guaranteed to execute exactly once among all threads that -// pass in this control object (until/unless the variable is reset). -static bli_pthread_once_t once_init = BLIS_PTHREAD_ONCE_INIT; -static bli_pthread_once_t once_finalize = BLIS_PTHREAD_ONCE_INIT; +static bli_pthread_switch_t lib_state = BLIS_PTHREAD_SWITCH_INIT; void bli_init_once( void ) { - bli_pthread_once( &once_init, bli_init_apis ); + bli_pthread_switch_on( &lib_state, bli_init_apis ); } void bli_finalize_once( void ) { - bli_pthread_once( &once_finalize, bli_finalize_apis ); + bli_pthread_switch_off( &lib_state, bli_finalize_apis ); } // ----------------------------------------------------------------------------- -void bli_init_apis( void ) +int bli_init_apis( void ) { // Initialize various sub-APIs. bli_gks_init(); @@ -91,17 +87,10 @@ void bli_init_apis( void ) bli_pack_init(); bli_memsys_init(); - // Reset the control variable that will allow finalization. - // NOTE: We must initialize a fresh pthread_once_t object and THEN copy the - // contents to the static control variable because some implementations of - // pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as - // a struct initializer expression (i.e. { ... }), which cannot be used in - // post-declaration struct assignment in strict C99. 
- const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT; - once_finalize = once_new; + return 0; } -void bli_finalize_apis( void ) +int bli_finalize_apis( void ) { // Finalize various sub-APIs. bli_memsys_finalize(); @@ -110,13 +99,6 @@ void bli_finalize_apis( void ) bli_ind_finalize(); bli_gks_finalize(); - // Reset the control variable that will allow (re-)initialization. - // NOTE: We must initialize a fresh pthread_once_t object and THEN copy the - // contents to the static control variable because some implementations of - // pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as - // a struct initializer expression (i.e. { ... }), which cannot be used in - // post-declaration struct assignment in strict C99. - const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT; - once_init = once_new; + return 0; } diff --git a/frame/base/bli_init.h b/frame/base/bli_init.h index f174ac0f9..d1bea0cb3 100644 --- a/frame/base/bli_init.h +++ b/frame/base/bli_init.h @@ -38,9 +38,9 @@ BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); -void bli_init_apis( void ); -void bli_finalize_apis( void ); - void bli_init_once( void ); void bli_finalize_once( void ); +int bli_init_apis( void ); +int bli_finalize_apis( void ); + diff --git a/frame/thread/bli_pthread.c b/frame/thread/bli_pthread.c index a09935661..804ace46d 100644 --- a/frame/thread/bli_pthread.c +++ b/frame/thread/bli_pthread.c @@ -693,3 +693,117 @@ int bli_pthread_barrier_wait #endif +// -- Non-standard extensions -------------------------------------------------- + +// -- pthread_switch -- + +// +// Note that bli_pthread_switch_t has the following properties: +// +// 1. Access to a switch is protected by a mutex specific to that switch, and +// therefore state changes are thread-safe. +// +// 2. An initialized switch always starts in the "off" state. +// +// 3. 
Calling _switch_on() when the switch is already "on" results in an early +// return (no action); similar for _switch_off() when it is already "off". +// +// 4. The _switch_on() and _switch_off() functions each return an error code +// that is equal to the return value of their user-supplied functions, +// provided the function in question was actually called rather than being +// skipped. When a function call is skipped (as in (3) above), the return +// value from _switch_on() and/or _switch_off() is 0 (success). +// +// Note that the user-supplied functions must abide by the convention that a +// return value of 0 indicates success and all other values indicate failure +// (of some kind). The switch and the user-supplied function must agree on +// how "success" is conveyed because the switch must know whether to toggle +// its state after inspecting the return value of the user-supplied function. +// + +int bli_pthread_switch_on + ( + bli_pthread_switch_t* sw, + int (*init)(void) + ) +{ + // NOTE: This function assumes that init() will return 0 on success; + // otherwise, it will return some other integer. If the function + // partially succeeds (in such a way that it must be called again in + // order to complete), it should treat that outcome as failure and + // return a non-zero value. + + // Initialize the return value with the error code for success. + int r_val = 0; + + // Proceed only if the switch is currently off; otherwise, we return with + // an error code of 0. + if ( sw->status == 0 ) + { + // Wait for and acquire the switch's lock. + bli_pthread_mutex_lock( &sw->mutex ); + + // Check the status of the switch once more now that we've acquired the + // lock. Proceed with calling the init() function only if the switch + // is still off; otherwise, release the lock with an error code of 0. + if ( sw->status == 0 ) + { + // Call the init() function and catch its return value in r_val. 
+ r_val = init(); + + // If the init() function succeeded, turn the switch on; + // otherwise, leave the switch off. + if ( r_val == 0 ) + sw->status = 1; + } + + // Release the switch's lock. + bli_pthread_mutex_unlock( &sw->mutex ); + } + + return r_val; +} + +int bli_pthread_switch_off + ( + bli_pthread_switch_t* sw, + int (*deinit)(void) + ) +{ + // NOTE: This function assumes that deinit() will return 0 on success; + // otherwise, it will return some other integer. If the function + // partially succeeds (in such a way that it must be called again in + // order to complete), it should treat that outcome as failure and + // return a non-zero value. + + // Initialize the return value with the error code for success. + int r_val = 0; + + // Proceed only if the switch is currently on; otherwise, we return with + // an error code of 0. + if ( sw->status == 1 ) + { + // Wait for and acquire the switch's lock. + bli_pthread_mutex_lock( &sw->mutex ); + + // Check the status of the switch once more now that we've acquired the + // lock. Proceed with calling the deinit() function only if the switch + // is still on; otherwise, release the lock with an error code of 0. + if ( sw->status == 1 ) + { + // Call the deinit() function and catch its return value in r_val. + r_val = deinit(); + + // If the deinit() function succeeded, turn the switch off; + // otherwise, leave the switch on. + if ( r_val == 0 ) + sw->status = 0; + } + + // Release the switch's lock. 
+ bli_pthread_mutex_unlock( &sw->mutex ); + } + + return r_val; +} + diff --git a/frame/thread/bli_pthread.h b/frame/thread/bli_pthread.h index be786aa39..dcf0db212 100644 --- a/frame/thread/bli_pthread.h +++ b/frame/thread/bli_pthread.h @@ -270,4 +270,29 @@ BLIS_EXPORT_BLIS int bli_pthread_barrier_wait bli_pthread_barrier_t* barrier ); +// -- Non-standard extensions -------------------------------------------------- + +// -- pthread_switch -- + +typedef struct +{ + int status; + bli_pthread_mutex_t mutex; +} bli_pthread_switch_t; + +#define BLIS_PTHREAD_SWITCH_INIT { .status = 0, \ + .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER } + +int bli_pthread_switch_on + ( + bli_pthread_switch_t* sw, + int (*init)(void) + ); + +int bli_pthread_switch_off + ( + bli_pthread_switch_t* sw, + int (*deinit)(void) + ); + #endif // BLIS_PTHREAD_H From 56772892450cc92b3fbd6a9d0460153a43fc47ab Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 1 Jun 2022 10:49:33 -0500 Subject: [PATCH 056/230] Added SMU citation to README.md intro. Details: - Added a citation to SMU and the Matthews Research Group to the general attribution of maintainership and development in the Introduction of the README.md file. Thanks to Robert van de Geijn and Devin Matthews for suggesting this change. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3803acdca..7996cb676 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,9 @@ The BLIS framework is primarily developed and maintained by individuals in the [Science of High-Performance Computing](http://shpc.ices.utexas.edu/) (SHPC) group in the [Oden Institute for Computational Engineering and Sciences](https://www.oden.utexas.edu/) -at [The University of Texas at Austin](https://www.utexas.edu/). 
+at [The University of Texas at Austin](https://www.utexas.edu/) +and in the [Matthews Research Group](https://matthewsresearchgroup.webstarts.com/) +at [Southern Methodist University](https://www.smu.edu/). Please visit the [SHPC](http://shpc.ices.utexas.edu/) website for more information about our research group, such as a list of [people](http://shpc.ices.utexas.edu/people.html) From d93df023348144e091f7b3e3053995648f348aa7 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 15 Jun 2022 14:09:49 -0500 Subject: [PATCH 057/230] Removed unused dt arg in bli_gks_query_ind_cntx(). Details: - Removed the num_t datatype argument from bli_gks_query_ind_cntx(). This argument stopped being needed by the function in commit e9da642. Its only use in bli_gks_query_ind_cntx() was to be passed through to the context initialization function for the chosen induced method, but even then, commit log notes from e9da642 indicate that I could not recall why the datatype argument was ever needed by the context init function to begin with. - Updated all invocations of bli_gks_query_ind_cntx() to omit the dt argument. Most of these invocations resided in various standalone test drivers (and the testsuite). 
--- frame/3/bli_l3_oapi_ex.c | 14 +++++++------- frame/3/gemm/bli_gemm_md.c | 3 +-- frame/base/bli_gks.c | 5 ++--- frame/base/bli_gks.h | 2 +- frame/compat/extra/bla_gemm3m.c | 4 ++-- test/1m4m/test_gemm.c | 2 +- test/3/test_gemm.c | 2 +- test/3/test_hemm.c | 2 +- test/3/test_herk.c | 2 +- test/3/test_trmm.c | 2 +- test/3/test_trsm.c | 2 +- test/studies/skx/test_gemm.c | 2 +- test/studies/skx/test_hemm.c | 2 +- test/studies/skx/test_syrk.c | 2 +- test/studies/skx/test_trmm.c | 2 +- test/studies/thunderx2/test_gemm.c | 2 +- test/studies/thunderx2/test_hemm.c | 2 +- test/studies/thunderx2/test_syrk.c | 2 +- test/studies/thunderx2/test_trmm.c | 2 +- testsuite/src/test_libblis.c | 8 +++++--- 20 files changed, 32 insertions(+), 32 deletions(-) diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index e4c815fe3..20b0294eb 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -102,7 +102,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) // If necessary, obtain a valid context from the gks using the induced // method id determined above. - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im ); // Check the operands. if ( bli_error_checking_is_enabled() ) @@ -153,7 +153,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) // If necessary, obtain a valid context from the gks using the induced // method id determined above. - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im ); // Check the operands. if ( bli_error_checking_is_enabled() ) @@ -281,7 +281,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) // If necessary, obtain a valid context from the gks using the induced // method id determined above. - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im ); // Check the operands. 
if ( bli_error_checking_is_enabled() ) @@ -331,7 +331,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) // If necessary, obtain a valid context from the gks using the induced // method id determined above. - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im ); // Check the operands. if ( bli_error_checking_is_enabled() ) @@ -381,7 +381,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) // If necessary, obtain a valid context from the gks using the induced // method id determined above. - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im ); // Check the operands. if ( bli_error_checking_is_enabled() ) @@ -487,7 +487,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) // If necessary, obtain a valid context from the gks using the induced // method id determined above. - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im ); // Check the operands. if ( bli_error_checking_is_enabled() ) @@ -534,7 +534,7 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) // If necessary, obtain a valid context from the gks using the induced // method id determined above. - if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); + if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im ); // Check the operands. if ( bli_error_checking_is_enabled() ) diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index a283c1235..1e23d058e 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -439,8 +439,7 @@ mddm_t bli_gemm_md_rcc // the target datatype. (The packm_blk_var1_md() function has "built-in" // support for packing to 1r (and 1e) schemas, whereas the // packm_blk_var1() function relies on packm kernels for packing to 1r. 
- const num_t dt_complex = bli_obj_dt( a ); - const cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); + const cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M ); const func_t* packm_1m_mr = bli_cntx_get_ukrs( BLIS_PACKM_MRXK_KER, cntx_1m ); const func_t* packm_1m_nr = bli_cntx_get_ukrs( BLIS_PACKM_NRXK_KER, cntx_1m ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 4a7ccbbc3..ff80f85ed 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -509,8 +509,7 @@ static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; const cntx_t* bli_gks_query_ind_cntx ( - ind_t ind, - num_t dt + ind_t ind ) { bli_init_once(); @@ -675,7 +674,7 @@ const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) // Query the context for the current induced method and datatype, and // then query the ukernel function pointer for the given datatype from // that context. - const cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); + const cntx_t* cntx = bli_gks_query_ind_cntx( method ); void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index 30e3b2e39..3a93fd59e 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -50,7 +50,7 @@ BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void ); const cntx_t* bli_gks_query_cntx_noinit( void ); -BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c index 31f677db6..258ac5bbb 100644 --- a/frame/compat/extra/bla_gemm3m.c +++ b/frame/compat/extra/bla_gemm3m.c @@ -103,7 +103,7 @@ void PASTEF77(ch,blasname) \ abbreviated version of bli_gemm_ex() so that we can bypass consideration of sup, which doesn't make sense in this 
context. */ \ { \ - cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \ + cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M ); \ \ rntm_t rntm_l; \ rntm_t* rntm = &rntm_l; \ @@ -222,7 +222,7 @@ void PASTEF77(ch,blasname) \ abbreviated version of bli_gemm_ex() so that we can bypass consideration of sup, which doesn't make sense in this context. */ \ { \ - cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M, dt ); \ + cntx_t* cntx = ( cntx_t* )bli_gks_query_ind_cntx( BLIS_1M ); \ \ rntm_t rntm_l; \ rntm_t* rntm = &rntm_l; \ diff --git a/test/1m4m/test_gemm.c b/test/1m4m/test_gemm.c index f9a855125..87bdceb11 100644 --- a/test/1m4m/test_gemm.c +++ b/test/1m4m/test_gemm.c @@ -109,7 +109,7 @@ int main( int argc, char** argv ) ind_t ind_mod = ind; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c index 745dae07c..96992f4a1 100644 --- a/test/3/test_gemm.c +++ b/test/3/test_gemm.c @@ -109,7 +109,7 @@ int main( int argc, char** argv ) ind_t ind_mod = ind; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c index 8df46f0f0..537378d43 100644 --- a/test/3/test_hemm.c +++ b/test/3/test_hemm.c @@ -87,7 +87,7 @@ int main( int argc, char** argv ) ind_t ind_mod = ind; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. 
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/3/test_herk.c b/test/3/test_herk.c index 65dcb9f6c..6dbaf1936 100644 --- a/test/3/test_herk.c +++ b/test/3/test_herk.c @@ -89,7 +89,7 @@ int main( int argc, char** argv ) ind_t ind_mod = ind; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/3/test_trmm.c b/test/3/test_trmm.c index 425630a2a..4e58b95fa 100644 --- a/test/3/test_trmm.c +++ b/test/3/test_trmm.c @@ -92,7 +92,7 @@ int main( int argc, char** argv ) ind_t ind_mod = ind; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c index 678be4330..4897d4627 100644 --- a/test/3/test_trsm.c +++ b/test/3/test_trsm.c @@ -92,7 +92,7 @@ int main( int argc, char** argv ) ind_t ind_mod = ind; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/skx/test_gemm.c b/test/studies/skx/test_gemm.c index 64311753c..53a227c2b 100644 --- a/test/studies/skx/test_gemm.c +++ b/test/studies/skx/test_gemm.c @@ -94,7 +94,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. 
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/skx/test_hemm.c b/test/studies/skx/test_hemm.c index 4ed9b2b67..1b0b1a609 100644 --- a/test/studies/skx/test_hemm.c +++ b/test/studies/skx/test_hemm.c @@ -93,7 +93,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/skx/test_syrk.c b/test/studies/skx/test_syrk.c index 5e1c43159..37b8c54da 100644 --- a/test/studies/skx/test_syrk.c +++ b/test/studies/skx/test_syrk.c @@ -92,7 +92,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/skx/test_trmm.c b/test/studies/skx/test_trmm.c index 1c7db7956..235e1e224 100644 --- a/test/studies/skx/test_trmm.c +++ b/test/studies/skx/test_trmm.c @@ -94,7 +94,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. 
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/thunderx2/test_gemm.c b/test/studies/thunderx2/test_gemm.c index f212c570b..7f1880558 100644 --- a/test/studies/thunderx2/test_gemm.c +++ b/test/studies/thunderx2/test_gemm.c @@ -93,7 +93,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/thunderx2/test_hemm.c b/test/studies/thunderx2/test_hemm.c index 5bf0373b4..11e0bea68 100644 --- a/test/studies/thunderx2/test_hemm.c +++ b/test/studies/thunderx2/test_hemm.c @@ -93,7 +93,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/thunderx2/test_syrk.c b/test/studies/thunderx2/test_syrk.c index 4b240e25a..5b9a9957b 100644 --- a/test/studies/thunderx2/test_syrk.c +++ b/test/studies/thunderx2/test_syrk.c @@ -92,7 +92,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. 
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/test/studies/thunderx2/test_trmm.c b/test/studies/thunderx2/test_trmm.c index 0fb153444..4851ac222 100644 --- a/test/studies/thunderx2/test_trmm.c +++ b/test/studies/thunderx2/test_trmm.c @@ -94,7 +94,7 @@ int main( int argc, char** argv ) if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + cntx = bli_gks_query_ind_cntx( ind_mod ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index f267ae158..da729b3a9 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -1080,9 +1080,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - // Query a native context. - cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX ); - cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX ); + // Query a native context. NOTE: Now that we've removed the dt argument from + // bli_gks_query_ind_cntx(), we can consolidate cntx_c and cntx_z; there is + // no need to query two contexts since they are the same. 
+ cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im ); + cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im ); libblis_test_fprintf_c( os, "level-3 blocksizes c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d\n", From d429b6bfced21a63bf711224ac402f93f0080b52 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 28 Jun 2022 15:34:10 -0500 Subject: [PATCH 058/230] Support clang targetting MinGW (#639) * Support clang targetting MinGW * Fix pthread linking --- build/config.mk.in | 1 + common.mk | 16 ++++++++++++---- configure | 7 ++++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/build/config.mk.in b/build/config.mk.in index 1b3468642..7ef8c6bd0 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -73,6 +73,7 @@ OS_NAME := @os_name@ # Check for whether the operating system is Windows. IS_WIN := @is_win@ +IS_MSVC := @is_msvc@ # The directory path to the top level of the source distribution. When # building in-tree, this path is ".". When building out-of-tree, this path diff --git a/common.mk b/common.mk index a93f8ab24..6661f84c5 100644 --- a/common.mk +++ b/common.mk @@ -436,7 +436,7 @@ LIBBLIS := libblis ifeq ($(OS_NAME),Darwin) SHLIB_EXT := dylib else ifeq ($(IS_WIN),yes) -ifeq ($(CC_VENDOR),gcc) +ifeq ($(IS_MSVC),no) SHLIB_EXT := dll.a else SHLIB_EXT := lib @@ -524,7 +524,7 @@ GIT_LOG := $(GIT) log --decorate # manually override whatever they need. # Define the external libraries we may potentially need at link-time. -ifeq ($(IS_WIN),yes) +ifeq ($(IS_MSVC),yes) LIBM := else LIBM := -lm @@ -566,7 +566,7 @@ else SOFLAGS := -shared ifeq ($(IS_WIN),yes) # Windows shared library link flags. 
-ifeq ($(CC_VENDOR),clang) +ifeq ($(IS_MSVC),yes) SOFLAGS += -Wl,-implib:$(BASE_LIB_PATH)/$(LIBBLIS).lib else SOFLAGS += -Wl,--out-implib,$(BASE_LIB_PATH)/$(LIBBLIS).dll.a @@ -687,7 +687,7 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CWARNFLAGS,$(c)))) # --- Position-independent code flags (shared libraries only) --- # Emit position-independent code for dynamic linking. -ifeq ($(IS_WIN),yes) +ifeq ($(IS_MSVC),yes) # Note: Don't use any fPIC flags for Windows builds since all code is position- # independent. CPICFLAGS := @@ -739,6 +739,14 @@ endif # Determine default export behavior / visibility of symbols for clang. ifeq ($(CC_VENDOR),clang) ifeq ($(IS_WIN),yes) +ifeq ($(IS_MSVC),no) +# This is a clang build targeting MinGW-w64 env +ifeq ($(EXPORT_SHARED),all) +BUILD_SYMFLAGS := -Wl,--export-all-symbols, -Wl,--enable-auto-import +else # ifeq ($(EXPORT_SHARED),all) +BUILD_SYMFLAGS := -Wl,--exclude-all-symbols +endif +endif # ifeq ($(IS_MSVC),no) ifeq ($(EXPORT_SHARED),all) # NOTE: clang on Windows does not appear to support exporting all symbols # by default, and therefore we ignore the value of EXPORT_SHARED. diff --git a/configure b/configure index 7e825f1dc..5ff877317 100755 --- a/configure +++ b/configure @@ -1170,7 +1170,7 @@ auto_detect() # Set the linker flags. We typically need pthreads (or BLIS's homerolled # equiavlent) because it is needed for parts of bli_arch.c unrelated to # bli_arch_string(), which is called by the main() function in ${main_c}. 
- if [[ "$is_win" == "no" || "$cc_vendor" != "clang" ]]; then + if [[ "$is_msvc" == "no" ]]; then ldflags="${LIBPTHREAD--lpthread}" fi @@ -2828,6 +2828,10 @@ main() if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _WIN32; then is_win=yes fi + is_msvc=no + if ${found_cc} -dM -E - < /dev/null 2> /dev/null | grep -q _MSC_VER; then + is_msvc=yes + fi # -- Check the compiler version -------------------------------------------- @@ -3830,6 +3834,7 @@ main() | sed -e "s/@kconfig_map@/${kconfig_map}/g" \ | sed -e "s/@os_name@/${os_name_esc}/g" \ | sed -e "s/@is_win@/${is_win}/g" \ + | sed -e "s/@is_msvc@/${is_msvc}/g" \ | sed -e "s/@dist_path@/${dist_path_esc}/g" \ | sed -e "s/@CC_VENDOR@/${cc_vendor}/g" \ | sed -e "s/@gcc_older_than_4_9_0@/${gcc_older_than_4_9_0}/g" \ From 667f201b7871da68622027d02bd6b7da3262f8e8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 7 Jul 2022 16:44:21 -0500 Subject: [PATCH 059/230] Fixed type bug in bli_cntx_set_ukr_prefs(). Details: - Fixed a bug in bli_cntx_set_ukr_prefs() which erroneously typecast the num_t value read from va_args() down to a bool before being stored within the cntx_t. This bug was introduced on April 6th 2022, in ae10d94. This caused the ukernel preferences for double real and double complex to go unchanged while the preferences for single real and single complex were corrupted by the former datatypes' preference values. The bug manifested as degraded performance for subconfigurations that registered column-preferential ukernels. The reason is that the erroneous preferences trigger unnecessary transpositions in the operation, which forces the gemm ukernel to compute on matrices that are not stored according to its preference. Thanks to Devin Matthews, Jeff Diamond, and Leick Robinson for their extensive efforts and assistance in tracking down this issue. - Augmented the informational header that is output by the testsuite to include ukernel preferences for gemm, gemmtrsm_[lu], and trsm_[lu]. 
- CREDITS file update. --- CREDITS | 1 + frame/base/bli_cntx.c | 2 +- testsuite/src/test_libblis.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index b701598cf..43c7b3ed5 100644 --- a/CREDITS +++ b/CREDITS @@ -84,6 +84,7 @@ but many others have contributed code and feedback, including Michael Rader @mrader1248 Pradeep Rao @pradeeptrgit (AMD) Aleksei Rechinskii + Leick Robinson @LeickR (Oracle) Karl Rupp @karlrupp Martin Schatz (The University of Texas at Austin) Nico Schlömer @nschloe diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 70057060f..8c6cafc13 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -316,7 +316,7 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) // - the datatype of the kernel, and // - the kernel function pointer const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0; - const bool ukr_pref_dt = ( num_t )va_arg( args, num_t ); + const num_t ukr_pref_dt = ( num_t )va_arg( args, num_t ); const bool ukr_pref = ( bool )va_arg( args, int ); // Index into the func_t and mbool_t for the current kernel id diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index da729b3a9..eaa0a9cef 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -962,6 +962,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_info_get_trsm_impl_string( BLIS_SCOMPLEX ), bli_info_get_trsm_impl_string( BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "\n" ); //bli_ind_disable_all(); @@ -1062,6 +1063,34 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_info_get_trsm_u_ukr_impl_string( BLIS_NAT, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "micro-kernel prefers rows? 
s d c z\n" ); + libblis_test_fprintf_c( os, " gemm %7d %7d %7d %7d\n", + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT, BLIS_GEMM_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE, BLIS_GEMM_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_GEMM_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_GEMM_UKR, cntx ) ); + libblis_test_fprintf_c( os, " gemmtrsm_l %7d %7d %7d %7d\n", + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT, BLIS_GEMMTRSM_L_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE, BLIS_GEMMTRSM_L_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_GEMMTRSM_L_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_GEMMTRSM_L_UKR, cntx ) ); + libblis_test_fprintf_c( os, " gemmtrsm_u %7d %7d %7d %7d\n", + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT, BLIS_GEMMTRSM_U_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE, BLIS_GEMMTRSM_U_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_GEMMTRSM_U_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_GEMMTRSM_U_UKR, cntx ) ); + libblis_test_fprintf_c( os, " trsm_l %7d %7d %7d %7d\n", + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT, BLIS_TRSM_L_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE, BLIS_TRSM_L_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_TRSM_L_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_TRSM_L_UKR, cntx ) ); + libblis_test_fprintf_c( os, " trsm_u %7d %7d %7d %7d\n", + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_FLOAT, BLIS_TRSM_U_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DOUBLE, BLIS_TRSM_U_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_SCOMPLEX, BLIS_TRSM_U_UKR, cntx ), + ( int )bli_cntx_ukr_prefers_rows_dt( BLIS_DCOMPLEX, BLIS_TRSM_U_UKR, cntx ) ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "\n" ); 
libblis_test_fprintf_c( os, "--- BLIS induced implementation info ---\n" ); libblis_test_fprintf_c( os, "\n" ); From 7cba7ce3dd1533fcc4ca96ac902bdf218686139a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 8 Jul 2022 11:15:18 -0500 Subject: [PATCH 060/230] Minor cleanups, comment updates to bli_gks.c. Details: - Removed a redundant registration of 'a64fx' subconfig in bli_gks_init(). - Reordered registration of 'armsve', 'a64fx', and 'firestorm' subconfigs. Thanks to Jeff Diamond for his input on this reordering. - Comment updates to bli_gks.c and arch_t enum in bli_type_defs.h. --- frame/base/bli_gks.c | 49 ++++++++++++++++++++--------------- frame/include/bli_type_defs.h | 6 ++++- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index ff80f85ed..094810d9d 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -64,7 +64,8 @@ void bli_gks_init( void ) // Register a context for each architecture that was #define'd in // bli_config.h. 
- // Intel architectures + // -- Intel architectures ---------------------------------------------- + #ifdef BLIS_CONFIG_SKX bli_gks_register_cntx( BLIS_ARCH_SKX, bli_cntx_init_skx, bli_cntx_init_skx_ref, @@ -96,7 +97,8 @@ void bli_gks_init( void ) bli_cntx_init_penryn_ind ); #endif - // AMD architectures + // -- AMD architectures ------------------------------------------------ + #ifdef BLIS_CONFIG_ZEN3 bli_gks_register_cntx( BLIS_ARCH_ZEN3, bli_cntx_init_zen3, bli_cntx_init_zen3_ref, @@ -133,12 +135,28 @@ void bli_gks_init( void ) bli_cntx_init_bulldozer_ind ); #endif - // ARM architectures + // -- ARM architectures ------------------------------------------------ + + // -- ARM-SVE -- +#ifdef BLIS_CONFIG_ARMSVE + bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve, + bli_cntx_init_armsve_ref, + bli_cntx_init_armsve_ind ); +#endif #ifdef BLIS_CONFIG_A64FX - bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx, + bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx, bli_cntx_init_a64fx_ref, bli_cntx_init_a64fx_ind ); #endif + + // -- ARM-NEON (4 pipes x 128-bit vectors) -- +#ifdef BLIS_CONFIG_FIRESTORM + bli_gks_register_cntx( BLIS_ARCH_FIRESTORM, bli_cntx_init_firestorm, + bli_cntx_init_firestorm_ref, + bli_cntx_init_firestorm_ind ); +#endif + + // -- ARM (2 pipes x 128-bit vectors) -- #ifdef BLIS_CONFIG_THUNDERX2 bli_gks_register_cntx( BLIS_ARCH_THUNDERX2, bli_cntx_init_thunderx2, bli_cntx_init_thunderx2_ref, @@ -154,21 +172,8 @@ void bli_gks_init( void ) bli_cntx_init_cortexa53_ref, bli_cntx_init_cortexa53_ind ); #endif -#ifdef BLIS_CONFIG_ARMSVE - bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve, - bli_cntx_init_armsve_ref, - bli_cntx_init_armsve_ind ); -#endif -#ifdef BLIS_CONFIG_A64FX - bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx, - bli_cntx_init_a64fx_ref, - bli_cntx_init_a64fx_ind ); -#endif -#ifdef BLIS_CONFIG_FIRESTORM - bli_gks_register_cntx( BLIS_ARCH_FIRESTORM, bli_cntx_init_firestorm, - 
bli_cntx_init_firestorm_ref, - bli_cntx_init_firestorm_ind ); -#endif + + // -- ARM (older 32-bit microarchitectures) -- #ifdef BLIS_CONFIG_CORTEXA15 bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15, bli_cntx_init_cortexa15_ref, @@ -180,7 +185,8 @@ void bli_gks_init( void ) bli_cntx_init_cortexa9_ind ); #endif - // IBM architectures + // -- IBM architectures ------------------------------------------------ + #ifdef BLIS_CONFIG_POWER10 bli_gks_register_cntx( BLIS_ARCH_POWER10, bli_cntx_init_power10, bli_cntx_init_power10_ref, @@ -202,7 +208,8 @@ void bli_gks_init( void ) bli_cntx_init_bgq_ind ); #endif - // Generic architectures + // -- Generic architectures -------------------------------------------- + #ifdef BLIS_CONFIG_GENERIC bli_gks_register_cntx( BLIS_ARCH_GENERIC, bli_cntx_init_generic, bli_cntx_init_generic_ref, diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index e957fc6b2..08c7ddc4a 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -928,10 +928,14 @@ typedef enum BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, - // ARM + // ARM-SVE BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, + + // ARM-NEON (4 pipes x 128-bit vectors) BLIS_ARCH_FIRESTORM, + + // ARM (2 pipes x 128-bit vectors) BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, From ffde54cc5c334aca8eff4d6072ba49496bf3104c Mon Sep 17 00:00:00 2001 From: jdiamondGitHub Date: Mon, 11 Jul 2022 16:47:30 -0500 Subject: [PATCH 061/230] Minor changes to .gitignore and LICENSE files. (#642) Details: - Macs create .DS_Store files in every directory visited. Updated .gitignore file so these files won't be reported as untracked by 'git status'. - Added Oracle Corporation to the LICENSE file. - Updated UT copyright on behalf of SHPC. 
--- .gitignore | 3 +++ LICENSE | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a24fe2b0e..6d51f6f51 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,6 @@ out.* GPATH GRTAGS GTAGS + +# Mac DS.store files +.DS_Store diff --git a/LICENSE b/LICENSE index b9cde54b8..8168814a9 100644 --- a/LICENSE +++ b/LICENSE @@ -6,6 +6,7 @@ while other portions are copyrighted by Hewlett Packard Enterprise Development LP Advanced Micro Devices, Inc. + Oracle Corporation with some overlap. Please see file-level license headers for file-specific copyright info. All parties provide their portions of the code under the @@ -13,9 +14,10 @@ copyright info. All parties provide their portions of the code under the --- -Copyright (C) 2018, The University of Texas at Austin +Copyright (C) 2012 - 2022, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. +Copyright (C) 2022, Oracle Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are From 98d467891b74021ace7f248cb0856bec734e39b6 Mon Sep 17 00:00:00 2001 From: bartoldeman Date: Mon, 11 Jul 2022 19:40:53 -0400 Subject: [PATCH 062/230] Change complex_return='intel' for ifx. (#637) Details: - When checking the version string of the Fortran compiler for the purposes of determining a default return convention for complex domain values, grep for "IFORT" instead of "ifort" since that string is common to both the 'ifx' and 'ifort' binaries provided by Intel: $ ifx --version ifx (IFORT) 2022.1.0 20220316 Copyright (C) 1985-2022 Intel Corporation. All rights reserved. $ ifort --version ifort (IFORT) 2021.6.0 20220226 Copyright (C) 1985-2022 Intel Corporation. All rights reserved. 
--- configure | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 5ff877317..a6018edab 100755 --- a/configure +++ b/configure @@ -3688,9 +3688,9 @@ main() # Query the compiler "vendor" (ie: the compiler's simple name). # The last part ({ read first rest ; echo $first ; }) is a workaround # to OS X's egrep only returning the first match. - fc_vendor=$(echo "${vendor_string}" | egrep -o 'ifort|GNU' | { read first rest ; echo $first ; }) + fc_vendor=$(echo "${vendor_string}" | egrep -o 'IFORT|GNU' | { read first rest ; echo $first ; }) - if [ "x${fc_vendor}" = "xifort" ]; then + if [ "x${fc_vendor}" = "xIFORT" ]; then complex_return='intel' elif [ "x${fc_vendor}" = "xGNU" ]; then complex_return='gnu' From 9b1beec60be31c6ea20b85806d61551497b699e4 Mon Sep 17 00:00:00 2001 From: bartoldeman Date: Mon, 11 Jul 2022 20:15:12 -0400 Subject: [PATCH 063/230] Use BLIS_ENABLE_COMPLEX_RETURN_INTEL in blastest files (#636) Details: - Fixed a crash that occurs when either cblat1 or zblat1 are linked with a build of BLIS that was compiled with '--complex-return=intel'. This fix involved inserting preprocessor macro guards based on BLIS_ENABLE_COMPLEX_RETURN_INTEL into blastest/src/cblat1.c and blastest/src/zblat1.c to correctly handle situations where BLIS is compiled with Intel/f2c-style calling conventions for complex numbers. - Updated blastest/src/fortran/run-f2c.sh so that future executions will insert the aforementioned cpp macro conditional where appropriate. 
--- blastest/src/cblat1.c | 32 ++++++++++++++++++++++++++++---- blastest/src/fortran/run-f2c.sh | 20 +++++++++++--------- blastest/src/zblat1.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 67 insertions(+), 17 deletions(-) diff --git a/blastest/src/cblat1.c b/blastest/src/cblat1.c index daccb2f6c..606511662 100644 --- a/blastest/src/cblat1.c +++ b/blastest/src/cblat1.c @@ -475,11 +475,23 @@ static real c_b52 = 0.f; integer mx, my; complex cdot[1]; integer lenx, leny; - extern /* Complex */ complex cdotc_(integer *, complex *, integer + extern /* Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void cdotc_(complex *, +#else +complex cdotc_( +#endif + integer *, complex *, integer *, complex *, integer *); extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, complex *, integer *); - extern /* Complex */ complex cdotu_(integer *, complex *, integer + extern /* Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void cdotu_(complex *, +#else +complex cdotu_( +#endif + integer *, complex *, integer *, complex *, integer *); extern /* Subroutine */ int cswap_(integer *, complex *, integer *, complex *, integer *), ctest_(integer *, complex *, complex *, @@ -526,14 +538,26 @@ static real c_b52 = 0.f; } if (combla_1.icase == 1) { /* .. CDOTC .. */ - q__1 = cdotc_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + cdotc_(&q__1, +#else + q__1 = cdotc_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = q__1.r, cdot[0].i = q__1.i; ctest_(&c__1, cdot, &ct6[kn + (ki << 2) - 5], &csize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. CDOTU .. 
*/ - q__1 = cdotu_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + cdotu_(&q__1, +#else + q__1 = cdotu_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = q__1.r, cdot[0].i = q__1.i; ctest_(&c__1, cdot, &ct7[kn + (ki << 2) - 5], &csize1[kn - 1], diff --git a/blastest/src/fortran/run-f2c.sh b/blastest/src/fortran/run-f2c.sh index fdad4fd34..f0df2f5b8 100755 --- a/blastest/src/fortran/run-f2c.sh +++ b/blastest/src/fortran/run-f2c.sh @@ -50,13 +50,15 @@ recursive-sed.sh -c "s/-4.f };/-4.f }};/g" -p "s*1.c" # Convert from brain-dead f2c complex calling conventions to normal # return-based conventions. -recursive-sed.sh -c "s/void cdotc_(complex \*, /complex cdotc_(/g" -p "c*1.c" -recursive-sed.sh -c "s/void cdotu_(complex \*, /complex cdotu_(/g" -p "c*1.c" -recursive-sed.sh -c "s/cdotc_(&q__1, /q__1 = cdotc_(/g" -p "c*1.c" -recursive-sed.sh -c "s/cdotu_(&q__1, /q__1 = cdotu_(/g" -p "c*1.c" - -recursive-sed.sh -c "s/void zdotc_(doublecomplex \*, /doublecomplex zdotc_(/g" -p "z*1.c" -recursive-sed.sh -c "s/void zdotu_(doublecomplex \*, /doublecomplex zdotu_(/g" -p "z*1.c" -recursive-sed.sh -c "s/zdotc_(\&z__1, /z__1 = zdotc_(/g" -p "z*1.c" -recursive-sed.sh -c "s/zdotu_(\&z__1, /z__1 = zdotu_(/g" -p "z*1.c" +subst1='\n#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL\n&\n#else\n' +subst2='\n#endif\n' +recursive-sed.sh -c "s/ void cdotc_(complex \*,/${subst1}complex cdotc_(${subst2}/g" -p "c*1.c" +recursive-sed.sh -c "s/ void cdotu_(complex \*,/${subst1}complex cdotu_(${subst2}/g" -p "c*1.c" +recursive-sed.sh -c "s/\(.*\)cdotc_(&q__1,/${subst1}\1q__1 = cdotc_(${subst2}\1/g" -p "c*1.c" +recursive-sed.sh -c "s/\(.*\)cdotu_(&q__1,/${subst1}\1q__1 = cdotu_(${subst2}\1/g" -p "c*1.c" + +recursive-sed.sh -c "s/ void zdotc_(doublecomplex \*,/${subst1}doublecomplex zdotc_(${subst2}/g" -p "z*1.c" +recursive-sed.sh -c "s/ void zdotu_(doublecomplex \*,/${subst1}doublecomplex zdotu_(${subst2}/g" -p "z*1.c" +recursive-sed.sh -c 
"s/\(.*\)zdotc_(\&z__1,/${subst1}\1z__1 = zdotc_(${subst2}\1/g" -p "z*1.c" +recursive-sed.sh -c "s/\(.*\)zdotu_(\&z__1,/${subst1}\1z__1 = zdotu_(${subst2}\1/g" -p "z*1.c" diff --git a/blastest/src/zblat1.c b/blastest/src/zblat1.c index c34a57262..b620910be 100644 --- a/blastest/src/zblat1.c +++ b/blastest/src/zblat1.c @@ -459,12 +459,24 @@ static doublereal c_b52 = 0.; integer lenx, leny; extern /* Subroutine */ int ctest_(integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *); - extern /* Double Complex */ doublecomplex zdotc_(integer *, + extern /* Double Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void zdotc_(doublecomplex *, +#else +doublecomplex zdotc_( +#endif + integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ksize; extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Double Complex */ doublecomplex zdotu_(integer *, + extern /* Double Complex */ +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + void zdotu_(doublecomplex *, +#else +doublecomplex zdotu_( +#endif + integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, @@ -508,14 +520,26 @@ static doublereal c_b52 = 0.; } if (combla_1.icase == 1) { /* .. ZDOTC .. */ - z__1 = zdotc_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + zdotc_(&z__1, +#else + z__1 = zdotc_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = z__1.r, cdot[0].i = z__1.i; ctest_(&c__1, cdot, &ct6[kn + (ki << 2) - 5], &csize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. ZDOTU .. 
*/ - z__1 = zdotu_(&combla_1.n, cx, &combla_1.incx, cy, & + +#ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL + zdotu_(&z__1, +#else + z__1 = zdotu_( +#endif + &combla_1.n, cx, &combla_1.incx, cy, & combla_1.incy); cdot[0].r = z__1.r, cdot[0].i = z__1.i; ctest_(&c__1, cdot, &ct7[kn + (ki << 2) - 5], &csize1[kn - 1], From cc260fd7068f0fe449d818435aa11adb14c17fed Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 13 Jul 2022 16:16:01 -0500 Subject: [PATCH 064/230] Allow uniform max problem sizes in test/3/runme.sh. Details: - Tweaked test/3/runme.sh so that the test driver binaries for single- threaded (st), single-socket (1s), and dual-socket (2s) execution can be built using identical problem size ranges. Previously, this was not possible because runme.sh used the maximum problem size, which was embedded into the binary filename, to tell the three classes of binaries apart from one another. Now, runme.sh uses the binary suffix ("st", "1s", or "2s") to tell them apart. This required only a few changes to the logic, but it also required a change in format to the threading config strings themselves (replacing the max problem size with "st", "1s", or "2s"). Thanks to Jeff Diamond for inspiring this improvement. - Comment updates. --- test/3/runme.sh | 90 +++++++++++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/test/3/runme.sh b/test/3/runme.sh index 56c192809..cf84bd121 100755 --- a/test/3/runme.sh +++ b/test/3/runme.sh @@ -5,12 +5,12 @@ exec_root="test" out_root="output" delay=0.1 -#sys="blis" +sys="blis" #sys="stampede2" #sys="lonestar5" #sys="ul252" #sys="ul264" -sys="ul2128" +#sys="ul2128" # Bind threads to processors. 
#export OMP_PROC_BIND=true @@ -22,9 +22,9 @@ if [ ${sys} = "blis" ]; then export GOMP_CPU_AFFINITY="0-3" numactl="" - threads="jc1ic1jr1_2400 - jc2ic3jr2_6000 - jc4ic3jr2_8000" + threads="jc1ic1jr1_st + jc2ic1jr1_1s + jc2ic2jr1_2s" elif [ ${sys} = "stampede2" ]; then @@ -32,9 +32,9 @@ elif [ ${sys} = "stampede2" ]; then exit 1 numactl="" - threads="jc1ic1jr1_2400 - jc4ic6jr1_6000 - jc4ic12jr1_8000" + threads="jc1ic1jr1_st + jc4ic6jr1_1s + jc4ic12jr1_2s" elif [ ${sys} = "lonestar5" ]; then @@ -44,9 +44,9 @@ elif [ ${sys} = "lonestar5" ]; then #export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" numactl="" - threads="jc1ic1jr1_2400 - jc2ic3jr2_6000 - jc4ic3jr2_8000" + threads="jc1ic1jr1_st + jc2ic3jr2_1s + jc4ic3jr2_2s" elif [ ${sys} = "ul252" ]; then @@ -54,9 +54,9 @@ elif [ ${sys} = "ul252" ]; then export GOMP_CPU_AFFINITY="0-51" numactl="" - threads="jc1ic1jr1_2400 - jc2ic13jr1_6000 - jc4ic13jr1_8000" + threads="jc1ic1jr1_st + jc2ic13jr1_1s + jc4ic13jr1_2s" elif [ ${sys} = "ul264" ]; then @@ -64,9 +64,9 @@ elif [ ${sys} = "ul264" ]; then export GOMP_CPU_AFFINITY="0-63" numactl="numactl --interleave=all" - threads="jc1ic1jr1_2400 - jc1ic8jr4_6000 - jc2ic8jr4_8000" + threads="jc1ic1jr1_st + jc1ic8jr4_1s + jc2ic8jr4_2s" elif [ ${sys} = "ul2128" ]; then @@ -74,14 +74,14 @@ elif [ ${sys} = "ul2128" ]; then export GOMP_CPU_AFFINITY="0-127" numactl="numactl --interleave=all" - threads="jc1ic1jr1_2400 - jc4ic4jr4_6000 - jc8ic4jr4_8000" - #threads="jc4ic4jr4_6000 - # jc8ic4jr4_8000" - #threads="jc1ic1jr1_2400" - #threads="jc4ic4jr4_6000" - #threads="jc8ic4jr4_8000" + threads="jc1ic1jr1_st + jc4ic4jr4_1s + jc8ic4jr4_2s" + #threads="jc4ic4jr4_1s + # jc8ic4jr4_2s" + #threads="jc1ic1jr1_st" + #threads="jc4ic4jr4_1s" + #threads="jc8ic4jr4_2s" fi # Datatypes to test. @@ -93,12 +93,12 @@ test_ops="gemm hemm herk trmm trsm" #test_ops="herk" # Implementations to test. 
-#impls="blis" +impls="blis" #impls="openblas" #impls="vendor" #impls="other" #impls="eigen" -impls="all" +#impls="all" if [ "${impls}" = "blis" ]; then @@ -129,7 +129,7 @@ fi GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY} -# First perform real test cases. +# Iterate over the threading configs. for th in ${threads}; do # Start with one way of parallelism in each loop. We will now begin @@ -139,7 +139,8 @@ for th in ${threads}; do # Strip everything before and after the underscore so that what remains # is the problem size and threading parameter string, respectively. - psize=${th##*_}; thinfo=${th%%_*} + #psize=${th##*_}; thinfo=${th%%_*} + tsuf=${th##*_}; thinfo=${th%%_*} # Identify each threading parameter and insert a space before it. thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" ) @@ -166,13 +167,32 @@ for th in ${threads}; do done - echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}" + # Find a binary using the test driver prefix and the threading suffix. + # Then strip everything before and after the max problem size that's + # encoded into the name of the binary. + binname=$(ls -1 ${exec_root}_*_${tsuf}.x | head -n1) + temp1=${binname#${exec_root}_*_} + psize=${temp1%%_*} + + # Sanity check: If 'ls' couldn't find any binaries, then the user + # probably didn't build them. Inform the user and proceed to the next + # threading config. + if [ "${binname}" = "" ]; then + echo "Could not find binaries corresponding to '${tsuf}' threading config. Skipping." + continue + fi + + # Let the user know what threading config we are working on. + echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}" + # Iterate over the datatypes. for dt in ${test_dts}; do + # Iterate over the implementations. for im in ${test_impls}; do + # Iterate over the operations. 
for op in ${test_ops}; do # Eigen does not support multithreading for hemm, herk, trmm, @@ -185,14 +205,12 @@ for th in ${threads}; do fi # Find the threading suffix by probing the executable. - binname=$(ls ${exec_root}_${dt}${op}_${psize}_${im}_*.x) - suf_ext=${binname##*_} - suf=${suf_ext%%.*} + binname=$(ls ${exec_root}_${dt}${op}_*_${im}_${tsuf}.x) #echo "found file: ${binname} with suffix ${suf}" # Set the number of threads according to th. - if [ "${suf}" = "1s" ] || [ "${suf}" = "2s" ]; then + if [ "${tsuf}" = "1s" ] || [ "${tsuf}" = "2s" ]; then # Set the threading parameters based on the implementation # that we are preparing to run. @@ -237,10 +255,10 @@ for th in ${threads}; do fi # Construct the name of the test executable. - exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x" + exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${tsuf}.x" # Construct the name of the output file. - out_file="${out_root}_${suf}_${dt}${op}_${im}.m" + out_file="${out_root}_${tsuf}_${dt}${op}_${im}.m" #echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}" echo "Running ${numactl} ./${exec_name} > ${out_file}" From 17b0caa2b2bff439feb6d2b39cfa16e7591882b0 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 14 Jul 2022 17:55:34 -0500 Subject: [PATCH 065/230] Fixed out-of-bounds read in haswell gemmsup kernels. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Details: - Fixed memory access bugs in the bli_sgemmsup_rv_haswell_asm_Mx2() kernels, where M = {1,2,3,4,5,6}. The bugs were caused by loading four single-precision elements of C, via instructions such as: vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) in situations where only two elements are guaranteed to exist. (These bugs may not have manifested in earlier tests due to the leading dimension alignment that BLIS employs by default.) 
The issue was fixed by replacing lines like the one above with: vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) Thus, we use vmovsd to explicitly load only two elements of C into registers, and then operate on those values using register addressing. Thanks to Daniël de Kok for reporting these bugs in #635, and to Bhaskar Nallani for proposing the fix. - CREDITS file update. --- CREDITS | 1 + .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c | 63 ++++++++++++------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/CREDITS b/CREDITS index 43c7b3ed5..bb2b3798f 100644 --- a/CREDITS +++ b/CREDITS @@ -23,6 +23,7 @@ but many others have contributed code and feedback, including Dilyn Corner @dilyn-corner Mat Cross @matcross (NAG) @decandia50 + Daniël de Kok @danieldk (Explosion) Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany) Jeff Diamond (Oracle) Johannes Dieterich @iotamudelta diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c index 53a70d15f..efb336395 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c @@ -389,32 +389,38 @@ void bli_sgemmsup_rv_haswell_asm_6x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx,
0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) @@ -848,27 +854,32 @@ void bli_sgemmsup_rv_haswell_asm_5x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1288,22 +1299,26 @@ void bli_sgemmsup_rv_haswell_asm_4x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1683,17 +1698,20 @@ void bli_sgemmsup_rv_haswell_asm_3x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - 
vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2066,12 +2084,14 @@ void bli_sgemmsup_rv_haswell_asm_2x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2404,7 +2424,8 @@ void bli_sgemmsup_rv_haswell_asm_1x2 label(.SROWSTORED) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + vmovsd(mem(rcx), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) From af3a41e02534befdae026377592ce437bab83023 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 21 Jul 2022 18:05:48 +0200 Subject: [PATCH 066/230] Add autodetection for POWER7, POWER9 & POWER10 (#647) Read from `/proc/cpuinfo` as done for ARM. Fixes #501 --- frame/base/bli_cpuid.c | 27 +++++++++++++++++++++++---- frame/base/bli_cpuid.h | 6 +++++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index ff0f386e6..527db1f5d 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -485,7 +485,7 @@ bool bli_cpuid_is_bulldozer return TRUE; } -#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) +#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC) arch_t bli_cpuid_query_id( void ) { @@ -530,9 +530,14 @@ arch_t bli_cpuid_query_id( void ) return BLIS_ARCH_GENERIC; } } - else if ( vendor == VENDOR_UNKNOWN ) + else if ( vendor == VENDOR_IBM ) { - return BLIS_ARCH_GENERIC; + if ( model == MODEL_POWER7) + return BLIS_ARCH_POWER7; + else if ( model == MODEL_POWER9) + return BLIS_ARCH_POWER9; + else if ( model == MODEL_POWER10) + return BLIS_ARCH_POWER10; } return BLIS_ARCH_GENERIC; @@ -1203,7 +1208,7 @@ uint32_t bli_cpuid_query return 
VENDOR_ARM; } -#elif defined(__arm__) || defined(_M_ARM) +#elif defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC) /* I can't easily find documentation to do this as for aarch64, though @@ -1240,6 +1245,20 @@ uint32_t bli_cpuid_query char feat_str[ TEMP_BUFFER_SIZE ]; char* r_val; +#ifdef _ARCH_PPC + r_val = find_string_in( "cpu", proc_str, TEMP_BUFFER_SIZE, pci_str ); + if ( r_val == NULL ) return VENDOR_IBM; + + if ( strstr( proc_str, "POWER7" ) != NULL ) + *model = MODEL_POWER7; + else if ( strstr( proc_str, "POWER9" ) != NULL ) + *model = MODEL_POWER9; + else if ( strstr( proc_str, "POWER10" ) != NULL ) + *model = MODEL_POWER10; + + return VENDOR_IBM; +#endif + //printf( "bli_cpuid_query(): beginning search\n" ); // Search /proc/cpuinfo for the 'Processor' entry. diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index 3fea78e5a..c10f36a1c 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -161,19 +161,23 @@ enum FEATURE_AVX512VL = 0x4000 }; -#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) +#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, + VENDOR_IBM, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, + MODEL_POWER7, + MODEL_POWER9, + MODEL_POWER10, MODEL_UNKNOWN }; enum From 6826c1cdfba855513786d9e3d606681316453398 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 25 Jul 2022 18:21:05 -0500 Subject: [PATCH 067/230] Add `#line` directives to flattened `blis.h`. (#643) Details: - Modified flatten-headers.py so that #line directives are inserted into the flattened blis.h file. 
This facilitates easier debugging when something is amiss in the flattened blis.h because the compiler will be able to refer to the line number within the original constituent header file (which is where the fix would go) rather than the line number within the flattened header (which is not as helpful). --- build/flatten-headers.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/build/flatten-headers.py b/build/flatten-headers.py index 563725a7e..40fc2a450 100755 --- a/build/flatten-headers.py +++ b/build/flatten-headers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -215,9 +215,19 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Open the input file to process. ifile = open( inputfile, "r" ) + # A counter to track the line number being parsed within the current file. + # This counter, when selectively encoded into the flattened header via #line + # directives, facilitates easier debugging. (When the compiler finds an + # issue, it will be able to refer to the line number within the constituent + # header file rather than the flattened one.) + lineno = 0 + # Iterate over the lines in the file. while True: + # Increment the line number. + lineno += 1 + # Read a line in the file. line = ifile.readline() @@ -268,12 +278,14 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Mark the beginning of the header being inserted. ostring += "%s%s%c" % ( beginstr, header, '\n' ) + ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) # Recurse on the header, accumulating the string. ostring += flatten_header( header_path, header_dirpaths, cursp + " " ) # Mark the end of the header being inserted. ostring += "%s%s%c" % ( endstr, header, '\n' ) + ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) echov2( "%sheader file '%s' fully processed." 
\ % ( cursp, header_path ) ) @@ -300,7 +312,7 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # endif # endwhile - + # Close the input file. ifile.close() @@ -330,7 +342,6 @@ def find_header_dirs( dirpath ): #endfor return header_dirpaths - # ------------------------------------------------------------------------------ From 4dde947e2ec9e139c162801320c94e6a01a39708 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 26 Jul 2022 17:29:32 -0500 Subject: [PATCH 068/230] Fixed out-of-bounds bug in sup s6x16m haswell kernel. Details: - Fixed another out-of-bounds read access bug in the haswell sup assembly kernels. This bug is similar to the one fixed in 17b0caa and affects bli_sgemmsup_rv_haswell_asm_6x2m(). Thanks to Madeesh Kannan for reporting this bug (and a suitable fix) in #635. - CREDITS file update. --- CREDITS | 1 + .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CREDITS b/CREDITS index bb2b3798f..aa1591334 100644 --- a/CREDITS +++ b/CREDITS @@ -47,6 +47,7 @@ but many others have contributed code and feedback, including Matthew Honnibal @honnibal Stefan Husmann @stefanhusmann Francisco Igual @figual (Universidad Complutense de Madrid) + Madeesh Kannan @shadeMe Tony Kelman @tkelman Lee Killough @leekillough (Cray) Mike Kistler @mkistler (IBM, Austin Research Laboratory) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c index aacfd8d1f..b5424f09a 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c @@ -4477,32 +4477,38 @@ void bli_sgemmsup_rv_haswell_asm_6x2m label(.SROWSTORED) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) + vmovsd(mem(rcx, 0*32), xmm0) + vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) + vmovsd(mem(rcx, 0*32), xmm0) + 
vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) + vmovsd(mem(rcx, 0*32), xmm0) + vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) + vmovsd(mem(rcx, 0*32), xmm0) + vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) + vmovsd(mem(rcx, 0*32), xmm0) + vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) + vmovsd(mem(rcx, 0*32), xmm0) + vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) From 56de31b00fa0f1ba866321817cd1e5d83000ff11 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 27 Jul 2022 13:54:17 -0500 Subject: [PATCH 069/230] Disable modification of KC in the gemmsup kernels. (#648) This led to a ~50% performance reduction for certain gemm operations (but not others?). See #644 for example. --- frame/3/bli_l3_sup_var1n2m.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index a5d66783f..61c85d6e9 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -295,6 +295,9 @@ void PASTEMAC(ch,varname) \ const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ \ + /* Disable modification of KC since it seems to negatively impact certain operations (#644). 
*/ \ + dim_t KC = KC0; \ + /* \ dim_t KC; \ if ( packa && packb ) \ { \ @@ -320,7 +323,7 @@ void PASTEMAC(ch,varname) \ stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ else KC = KC0; \ } \ - else /* if ( !packa && !packb ) */ \ + else *//* if ( !packa && !packb ) *//* \ { \ if ( FALSE ) KC = KC0; \ else if ( stor_id == BLIS_RRC || \ @@ -330,7 +333,7 @@ void PASTEMAC(ch,varname) \ else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ else KC = (( KC0 / 5 ) / 4 ) * 4; \ - } \ + }*/ \ \ /* Nudge NC up to a multiple of MR and MC up to a multiple of NR. NOTE: This is unique to variant 1 (ie: not performed in variant 2) @@ -932,6 +935,9 @@ void PASTEMAC(ch,varname) \ const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ \ + /* Disable modification of KC since it seems to negatively impact certain operations (#644). */ \ + dim_t KC = KC0; \ + /* \ dim_t KC; \ if ( packa && packb ) \ { \ @@ -957,7 +963,7 @@ void PASTEMAC(ch,varname) \ stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ else KC = KC0; \ } \ - else /* if ( !packa && !packb ) */ \ + else *//* if ( !packa && !packb ) *//* \ { \ if ( stor_id == BLIS_RRR || \ stor_id == BLIS_CCC ) KC = KC0; \ @@ -968,7 +974,7 @@ void PASTEMAC(ch,varname) \ else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ else KC = (( KC0 / 5 ) / 4 ) * 4; \ - } \ + }*/ \ \ /* Query the maximum blocksize for NR, which implies a maximum blocksize extension for the final iteration. */ \ From 5b298935de7f20462bfad1893ed34ecd691cec5a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 27 Jul 2022 19:14:15 -0500 Subject: [PATCH 070/230] Removed buggy cruft from power10 subconfig. Details: - Removed #defines for BLIS_BBN_s and BLIS_BBN_d from bli_kernel_defs_power10.h. 
These were inadvertently set in ae10d949 because the power10 subconfig was registering bb packm ukernels, but only for 6xk (power10 uses s8x16 and d8x8 ukernels) and only because the original author (probably) copy-pasted from power9 when getting started. That 6xk packm registration was effectively "dead code" prior to ae10d949, but was then mistaken as not-dead code during the ae10d949 refactor. These improper bb factors may have been causing bugs in power10 builds. Thanks to Nicholai Tukanov for helping remind me what the power10 subconfig was supposed to look like. - Removed extraneous microkernel preference registrations from power10 subconfig. Preferences for single and double complex gemm were being registered despite there being no complex gemm ukernels registered to go with them. Similarly, there were trsm preferences registered without any trsm ukernels registered (and BLIS doesn't actually use a preference for the trsm ukernel anyway). These extraneous registrations were almost surely not hurting anything, even if they were quite misleading. 
--- config/power10/bli_cntx_init_power10.c | 10 ---------- config/power10/bli_kernel_defs_power10.h | 2 -- 2 files changed, 12 deletions(-) diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 12d9f51c6..f662d5791 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -63,16 +63,6 @@ void bli_cntx_init_power10( cntx_t* cntx ) // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, - BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, - BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, - BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, - BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, - BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, - BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, BLIS_VA_END ); diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h index 4e32f1173..9b47a77c0 100644 --- a/config/power10/bli_kernel_defs_power10.h +++ b/config/power10/bli_kernel_defs_power10.h @@ -44,8 +44,6 @@ #define BLIS_NR_s 16 #define BLIS_NR_d 8 -#define BLIS_BBN_s 4 -#define BLIS_BBN_d 2 //#endif From a48e29d799091a833213efeafaf2d342ebdafde9 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 28 Jul 2022 10:11:07 -0500 Subject: [PATCH 071/230] CREDITS file update. Details: - Thanks to Kihiro Bando for assisting with issue #644. --- CREDITS | 1 + 1 file changed, 1 insertion(+) diff --git a/CREDITS b/CREDITS index aa1591334..49361c801 100644 --- a/CREDITS +++ b/CREDITS @@ -16,6 +16,7 @@ but many others have contributed code and feedback, including Alex Arslan @ararslan Vernon Austel (IBM, T.J. 
Watson Research Center) Satish Balay @balay (Argonne National Laboratory) + Kihiro Bando @bandokihiro Matthew Brett @matthew-brett (University of Birmingham) Jérémie du Boisberranger @jeremiedbb Jed Brown @jedbrown (Argonne National Laboratory) From bbaf29abd942de47a3a99a80a67d12bab41b27db Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 4 Aug 2022 17:51:37 -0500 Subject: [PATCH 072/230] Very minor variable updates to common.mk. Details: - Fixed a harmless bug that would have allowed C++ headers into the list of header suffixes specifically reserved for C99 headers. In practice, this would have had no substantive effect on anything since the core BLIS framework does not use C++ headers. --- common.mk | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/common.mk b/common.mk index 6661f84c5..33713e9f5 100644 --- a/common.mk +++ b/common.mk @@ -342,7 +342,8 @@ SANDBOX_CXX_SUFS := cc cpp cxx SANDBOX_SRC_SUFS := $(SANDBOX_C99_SUFS) $(SANDBOX_CXX_SUFS) # Header suffixes. -FRAME_HDR_SUFS := h +FRAME_H99_SUFS := h +FRAME_HDR_SUFS := $(FRAME_H99_SUFS) ADDON_H99_SUFS := h ADDON_HXX_SUFS := hh hpp hxx @@ -357,8 +358,8 @@ ALL_HDR_SUFS := $(sort $(FRAME_HDR_SUFS) \ $(ADDON_HDR_SUFS) \ $(SANDBOX_HDR_SUFS) ) -ALL_H99_SUFS := $(sort $(FRAME_HDR_SUFS) \ - $(ADDON_HDR_SUFS) \ +ALL_H99_SUFS := $(sort $(FRAME_H99_SUFS) \ + $(ADDON_H99_SUFS) \ $(SANDBOX_H99_SUFS) ) # The names of scripts that check output from the BLAS test drivers and From 775148bcdbb1014b4881a76306f35f5d0fedecbe Mon Sep 17 00:00:00 2001 From: jdiamondGitHub Date: Fri, 5 Aug 2022 12:01:24 -0500 Subject: [PATCH 073/230] Updated ARMv8a kernels to fix 2 prefetching issues. (#649) Details: - The ARMv8a dgemm/sgemm microkernels had 2 prefetching issues that impacted performance on modern ARM platforms. The most significant issue was that only a single prefetch per C tile column was issued.
When a column of C was not cache aligned, the second cache line would not be prefetched at all, forcing the kernel to wait for an entire load to update elements of C. This happened with roughly 50% of the C prefetches. The fix was to have two prefetches per column, spaced 64 bytes (1 cache line) apart. - A secondary performance issue was that all the C prefetch instructions were issued sequentially at the beginning of the kernel call. This caused a noticeable performance slowdown. Interleaving the prefetch calls every 2-3 instructions in the prologue code solved the issue. --- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 356 ++++++++++++-------- 1 file changed, 211 insertions(+), 145 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 94f0090bc..12c670a9f 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -1,4 +1,4 @@ - /* +/* BLIS An object-based framework for developing high-performance BLAS-like @@ -30,12 +30,20 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ #include "blis.h" #include "armv8a_asm_utils.h" +// #define DISPLAY_DEBUG_INFO + +// Added prefetch fix for non-cacheline aligned C columns +// (with the prefetches interleaved with other instructions) +// to both sgemm and dgemm versions. + +// Added sgemm prefetch fix for non-cacheline aligned C columns +// (with the prefetches interleaved with other instructions) + /* o 4x4 Single precision micro-kernel fully functional. o Runnable on ARMv8, compiled with aarch64 GCC. @@ -50,7 +58,13 @@ * Tested on Juno Board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz. * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz. 
+ + * UPDATE JULY 2021 - Leick Robinson + * Both Microkernels changed to fix two prefetching performance bugs + * Tested on 2s Altra. Around 6,900 GFLOPS, 160 x N2 cores @ 3.0 GHz + * Tested on 1s Altra Max. Arnd 5,800 GFLOPS. 128 x N2 cores @ 3.0 GHz */ + void bli_sgemm_armv8a_asm_8x12 ( dim_t m, @@ -86,73 +100,111 @@ void bli_sgemm_armv8a_asm_8x12 " ldr x1,%[baddr] \n\t" // Load address of B. " ldr x2,%[caddr] \n\t" // Load address of C. " \n\t" - " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). - " ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). - " \n\t" " ldr x10,%[cs_c] \n\t" // Load cs_c. " lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. " \n\t" + " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). + " ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). + " add x16,x2,x10 \n\t" // Load address Column 1 of C + " \n\t" // " ldr x14,%[rs_c] \n\t" // Load rs_c. // " lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). " \n\t" - " add x16,x2,x10 \n\t" //Load address Column 1 of C - " add x17,x16,x10 \n\t" //Load address Column 2 of C - " add x19,x17,x10 \n\t" //Load address Column 3 of C - " add x20,x19,x10 \n\t" //Load address Column 4 of C - " add x21,x20,x10 \n\t" //Load address Column 5 of C - " add x22,x21,x10 \n\t" //Load address Column 6 of C - " add x23,x22,x10 \n\t" //Load address Column 7 of C - " add x24,x23,x10 \n\t" //Load address Column 8 of C - " add x25,x24,x10 \n\t" //Load address Column 9 of C - " add x26,x25,x10 \n\t" //Load address Column 10 of C - " add x27,x26,x10 \n\t" //Load address Column 11 of C - " \n\t" - " prfm pldl1keep,[x2] \n\t" // Prefetch c. - " prfm pldl1keep,[x16] \n\t" // Prefetch c. - " prfm pldl1keep,[x17] \n\t" // Prefetch c. - " prfm pldl1keep,[x19] \n\t" // Prefetch c. - " prfm pldl1keep,[x20] \n\t" // Prefetch c. - " prfm pldl1keep,[x21] \n\t" // Prefetch c. - " prfm pldl1keep,[x22] \n\t" // Prefetch c. - " prfm pldl1keep,[x23] \n\t" // Prefetch c. 
- " prfm pldl1keep,[x24] \n\t" // Prefetch c. - " prfm pldl1keep,[x25] \n\t" // Prefetch c. - " prfm pldl1keep,[x26] \n\t" // Prefetch c. - " prfm pldl1keep,[x27] \n\t" // Prefetch c. " \n\t" " dup v8.4s, wzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #192] \n\t" + " prfm pldl1keep,[x2] \n\t" // Prefetch c. + " add x17,x16,x10 \n\t" // Load address Column 2 of C + " dup v9.4s, wzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #256] \n\t" + " \n\t" // Since columns of C can cross a cache + // line boundary, we also need to prefetch + // the "ends." + " prfm pldl1keep,[x2, #16] \n\t" // Prefetch c. + " add x19,x17,x10 \n\t" // Load address Column 3 of C + " dup v10.4s, wzr \n\t" // Vector for accummulating column 1 " prfm PLDL1KEEP, [x1, #320] \n\t" + " prfm pldl1keep,[x16] \n\t" // Prefetch c. + " add x20,x19,x10 \n\t" // Load address Column 4 of C + " dup v11.4s, wzr \n\t" // Vector for accummulating column 1 + " prfm pldl1keep,[x16] \n\t" // Prefetch c. + " prfm pldl1keep,[x16, #16] \n\t" // Prefetch c. + " dup v12.4s, wzr \n\t" // Vector for accummulating column 2 + " prfm pldl1keep,[x17] \n\t" // Prefetch c. + " add x21,x20,x10 \n\t" // Load address Column 5 of C + " dup v13.4s, wzr \n\t" // Vector for accummulating column 2 + " prfm pldl1keep,[x17, #16] \n\t" // Prefetch c. " \n\t" " dup v14.4s, wzr \n\t" // Vector for accummulating column 3 " prfm PLDL1KEEP, [x0, #128] \n\t" + " prfm pldl1keep,[x19] \n\t" // Prefetch c. + " add x22,x21,x10 \n\t" // Load address Column 6 of C + " dup v15.4s, wzr \n\t" // Vector for accummulating column 3 " prfm PLDL1KEEP, [x0, #192] \n\t" + " prfm pldl1keep,[x19, #16] \n\t" // Prefetch c. + " dup v16.4s, wzr \n\t" // Vector for accummulating column 4 + " prfm pldl1keep,[x20] \n\t" // Prefetch c. + " add x23,x22,x10 \n\t" // Load address Column 7 of C + " dup v17.4s, wzr \n\t" // Vector for accummulating column 4 + " prfm pldl1keep,[x20, #16] \n\t" // Prefetch c. 
+ " dup v18.4s, wzr \n\t" // Vector for accummulating column 5 + " prfm pldl1keep,[x21] \n\t" // Prefetch c. + " add x24,x23,x10 \n\t" // Load address Column 8 of C + " dup v19.4s, wzr \n\t" // Vector for accummulating column 5 + " prfm pldl1keep,[x21, #16] \n\t" // Prefetch c. + " \n\t" " dup v20.4s, wzr \n\t" // Vector for accummulating column 6 + " prfm pldl1keep,[x22] \n\t" // Prefetch c. + " add x25,x24,x10 \n\t" // Load address Column 9 of C + " dup v21.4s, wzr \n\t" // Vector for accummulating column 6 + " prfm pldl1keep,[x22, #16] \n\t" // Prefetch c. + " dup v22.4s, wzr \n\t" // Vector for accummulating column 7 + " prfm pldl1keep,[x23] \n\t" // Prefetch c. + " add x26,x25,x10 \n\t" // Load address Column 10 of C + " dup v23.4s, wzr \n\t" // Vector for accummulating column 7 + " prfm pldl1keep,[x23, #16] \n\t" // Prefetch c. + " dup v24.4s, wzr \n\t" // Vector for accummulating column 8 + " prfm pldl1keep,[x24] \n\t" // Prefetch c. + " add x27,x26,x10 \n\t" // Load address Column 11 of C + " dup v25.4s, wzr \n\t" // Vector for accummulating column 8 + " prfm pldl1keep,[x24, #16] \n\t" // Prefetch c. " \n\t" " dup v26.4s, wzr \n\t" // Vector for accummulating column 9 + " prfm pldl1keep,[x25] \n\t" // Prefetch c. + " dup v27.4s, wzr \n\t" // Vector for accummulating column 9 + " prfm pldl1keep,[x25, #16] \n\t" // Prefetch c. + " dup v28.4s, wzr \n\t" // Vector for accummulating column 10 + " prfm pldl1keep,[x26] \n\t" // Prefetch c. + " dup v29.4s, wzr \n\t" // Vector for accummulating column 10 + " prfm pldl1keep,[x26, #16] \n\t" // Prefetch c. + " dup v30.4s, wzr \n\t" // Vector for accummulating column 11 + " prfm pldl1keep,[x27] \n\t" // Prefetch c. + " dup v31.4s, wzr \n\t" // Vector for accummulating column 11 + " prfm pldl1keep,[x27, #16] \n\t" // Prefetch c. + " \n\t" " \n\t" + " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. 
BEQ(SCONSIDERKLEFT) " \n\t" @@ -163,10 +215,10 @@ void bli_sgemm_armv8a_asm_8x12 " ldr q3, [x1, #16] \n\t" " ldr q4, [x1, #32] \n\t" " \n\t" - " add x0, x0, #32 \n\t" //update address of A - " add x1, x1, #48 \n\t" //update address of B + " add x0, x0, #32 \n\t" // Update address of A + " add x1, x1, #48 \n\t" // Update address of B " \n\t" - " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. + " cmp x5,1 \n\t" // If there's only one k_iter, jump to it BEQ(SLASTITER) // (as loop is do-while-like). " \n\t" LABEL(SLOOPKITER) // Body of the k_iter loop. @@ -206,7 +258,7 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #32] \n\t" - " \n\t" //End It 1 + " \n\t" // End It 1 " \n\t" " ldr q0, [x0, #32] \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. @@ -242,7 +294,7 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #80] \n\t" - " \n\t" //End It 2 + " \n\t" // End It 2 " \n\t" " ldr q5, [x0, #64] \n\t" " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. @@ -276,7 +328,7 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #128] \n\t" - " \n\t" //End It 3 + " \n\t" // End It 3 " \n\t" " ldr q0, [x0, #96] \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. @@ -312,7 +364,7 @@ void bli_sgemm_armv8a_asm_8x12 " ldr q4, [x1, #176] \n\t" " add x1, x1, #192 \n\t" " add x0, x0, #128 \n\t" - " \n\t" //End It 4 + " \n\t" // End It 4 " sub x5,x5,1 \n\t" // i-=1. " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. BNE(SLOOPKITER) @@ -352,7 +404,7 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. 
" ldr q4, [x1, #32] \n\t" - " \n\t" //End It 1 + " \n\t" // End It 1 " \n\t" " ldr q0, [x0, #32] \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. @@ -386,7 +438,7 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #80] \n\t" - " \n\t" //End It 2 + " \n\t" // End It 2 " \n\t" " ldr q5, [x0, #64] \n\t" " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. @@ -420,7 +472,7 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #128] \n\t" - " \n\t" //End It 3 + " \n\t" // End It 3 " \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. @@ -451,7 +503,7 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. " add x1, x1, #144 \n\t" " add x0, x0, #96 \n\t" - " \n\t" //End It 4 + " \n\t" // End It 4 " \n\t" LABEL(SCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. @@ -521,11 +573,11 @@ void bli_sgemm_armv8a_asm_8x12 " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. 
" \n\t" - " ldr q0, [x2] \n\t" //Load column 0 of C + " ldr q0, [x2] \n\t" // Load column 0 of C " ldr q1, [x2, #16] \n\t" - " ldr q2, [x16] \n\t" //Load column 1 of C + " ldr q2, [x16] \n\t" // Load column 1 of C " ldr q3, [x16, #16] \n\t" - " ldr q4, [x17] \n\t" //Load column 2 of C + " ldr q4, [x17] \n\t" // Load column 2 of C " ldr q5, [x17, #16] \n\t" " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta @@ -544,11 +596,11 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" - " str q0, [x2] \n\t" //Store column 0 of C + " str q0, [x2] \n\t" // Store column 0 of C " str q1, [x2, #16] \n\t" - " str q2, [x16] \n\t" //Store column 1 of C + " str q2, [x16] \n\t" // Store column 1 of C " str q3, [x16, #16] \n\t" - " str q4, [x17] \n\t" //Store column 2 of C + " str q4, [x17] \n\t" // Store column 2 of C " str q5, [x17, #16] \n\t" " \n\t" " dup v8.4s, wzr \n\t" @@ -561,11 +613,11 @@ void bli_sgemm_armv8a_asm_8x12 " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" - " ldr q8, [x19] \n\t" //Load column 3 of C + " ldr q8, [x19] \n\t" // Load column 3 of C " ldr q9, [x19, #16] \n\t" - " ldr q10, [x20] \n\t" //Load column 4 of C + " ldr q10, [x20] \n\t" // Load column 4 of C " ldr q11, [x20, #16] \n\t" - " ldr q12, [x21] \n\t" //Load column 5 of C + " ldr q12, [x21] \n\t" // Load column 5 of C " ldr q13, [x21, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta @@ -584,11 +636,11 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" - " str q8, [x19] \n\t" //Store column 3 of C + " str q8, [x19] \n\t" // Store column 3 of C " str q9, [x19, #16] \n\t" - " str q10, [x20] \n\t" //Store column 4 of C + " str q10, [x20] \n\t" // Store column 4 of C " str q11, [x20, #16] \n\t" - " str q12, [x21] \n\t" //Store column 5 of C + " str q12, [x21] \n\t" // Store column 5 of C " str q13, [x21, #16] \n\t" " \n\t" " dup v0.4s, wzr \n\t" @@ -601,11 +653,11 @@ void bli_sgemm_armv8a_asm_8x12 " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
" \n\t" - " ldr q0, [x22] \n\t" //Load column 6 of C + " ldr q0, [x22] \n\t" // Load column 6 of C " ldr q1, [x22, #16] \n\t" - " ldr q2, [x23] \n\t" //Load column 7 of C + " ldr q2, [x23] \n\t" // Load column 7 of C " ldr q3, [x23, #16] \n\t" - " ldr q4, [x24] \n\t" //Load column 8 of C + " ldr q4, [x24] \n\t" // Load column 8 of C " ldr q5, [x24, #16] \n\t" " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta @@ -624,11 +676,11 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" - " str q0, [x22] \n\t" //Store column 6 of C + " str q0, [x22] \n\t" // Store column 6 of C " str q1, [x22, #16] \n\t" - " str q2, [x23] \n\t" //Store column 7 of C + " str q2, [x23] \n\t" // Store column 7 of C " str q3, [x23, #16] \n\t" - " str q4, [x24] \n\t" //Store column 8 of C + " str q4, [x24] \n\t" // Store column 8 of C " str q5, [x24, #16] \n\t" " \n\t" " dup v8.4s, wzr \n\t" @@ -641,11 +693,11 @@ void bli_sgemm_armv8a_asm_8x12 " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
" \n\t" - " ldr q8, [x25] \n\t" //Load column 9 of C + " ldr q8, [x25] \n\t" // Load column 9 of C " ldr q9, [x25, #16] \n\t" - " ldr q10, [x26] \n\t" //Load column 10 of C + " ldr q10, [x26] \n\t" // Load column 10 of C " ldr q11, [x26, #16] \n\t" - " ldr q12, [x27] \n\t" //Load column 11 of C + " ldr q12, [x27] \n\t" // Load column 11 of C " ldr q13, [x27, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta @@ -667,11 +719,11 @@ void bli_sgemm_armv8a_asm_8x12 " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" - " str q8, [x25] \n\t" //Store column 9 of C + " str q8, [x25] \n\t" // Store column 9 of C " str q9, [x25, #16] \n\t" - " str q10, [x26] \n\t" //Store column 10 of C + " str q10, [x26] \n\t" // Store column 10 of C " str q11, [x26, #16] \n\t" - " str q12, [x27] \n\t" //Store column 11 of C + " str q12, [x27] \n\t" // Store column 11 of C " str q13, [x27, #16] \n\t" " \n\t" " \n\t" @@ -729,7 +781,13 @@ void bli_sgemm_armv8a_asm_8x12 * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz. * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz. -*/ + + * UPDATE JULY 2021 - Leick Robinson + * Both Microkernels changed to fix two prefetching performance bugs + * Tested on 2s Altra. 
Around 3,200 GFLOPS, 160 x N2 cores @ 3.0 GHz + * Tested on 1s Altra, Around 1,700 GFLOPS, 80 x N2 cores @ 3.0 GHz + * Tested on 1s Altra Max, ~ 2,600 GFLOPS, 128 x N2 cores @ 3.0 GHz + */ void bli_dgemm_armv8a_asm_6x8 ( dim_t m, @@ -744,6 +802,21 @@ void bli_dgemm_armv8a_asm_6x8 cntx_t* cntx ) { +#ifdef DISPLAY_DEBUG_INFO + + static bool bFirstTime = true; + + if ( bFirstTime ) + { + printf( "In bli_dgemm_armv8a_asm_6x8: rs_c0=%d, cs_c0=%d \n", + (int) rs_c0, (int) cs_c0 ); + fflush( stdout ); + bFirstTime = false; + } + +#endif + + void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); @@ -763,31 +836,17 @@ void bli_dgemm_armv8a_asm_6x8 " ldr x1,%[baddr] \n\t" // Load address of B " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" - " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) - " ldr x6,%[k_left] \n\t" // Init guard (k_iter) - " \n\t" " ldr x10,%[cs_c] \n\t" // Load cs_c " lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) " \n\t" + " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) + " ldr x6,%[k_left] \n\t" // Init guard (k_iter) + " add x20,x2,x10 \n\t" // Load address Column 1 of C + " \n\t" // " ldr x14,%[rs_c] \n\t" // Load rs_c. // " lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). " \n\t" - " add x20,x2,x10 \n\t" //Load address Column 1 of C - " add x21,x20,x10 \n\t" //Load address Column 2 of C - " add x22,x21,x10 \n\t" //Load address Column 3 of C - " add x23,x22,x10 \n\t" //Load address Column 4 of C - " add x24,x23,x10 \n\t" //Load address Column 5 of C - " add x25,x24,x10 \n\t" //Load address Column 6 of C - " add x26,x25,x10 \n\t" //Load address Column 7 of C " \n\t" - " prfm pldl1keep,[x2] \n\t" // Prefetch c. - " prfm pldl1keep,[x20] \n\t" // Prefetch c. - " prfm pldl1keep,[x21] \n\t" // Prefetch c. - " prfm pldl1keep,[x22] \n\t" // Prefetch c. - " prfm pldl1keep,[x23] \n\t" // Prefetch c. - " prfm pldl1keep,[x24] \n\t" // Prefetch c. - " prfm pldl1keep,[x25] \n\t" // Prefetch c. 
- " prfm pldl1keep,[x26] \n\t" // Prefetch c. " \n\t" " dup v8.2d, xzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #256] \n\t" @@ -798,33 +857,75 @@ void bli_dgemm_armv8a_asm_6x8 " dup v11.2d, xzr \n\t" // Vector for accummulating column 1 " prfm PLDL1KEEP, [x1, #448] \n\t" " dup v12.2d, xzr \n\t" // Vector for accummulating column 1 + " prfm PLDL1KEEP, [x0, #192] \n\t" + " add x21,x20,x10 \n\t" // Load address Column 2 of C + " dup v13.2d, xzr \n\t" // Vector for accummulating column 1 + " prfm PLDL1KEEP, [x0, #256] \n\t" " \n\t" " dup v14.2d, xzr \n\t" // Vector for accummulating column 2 - " prfm PLDL1KEEP, [x0, #192] \n\t" + " prfm PLDL1KEEP, [x0, #320] \n\t" + " add x22,x21,x10 \n\t" // Load address Column 3 of C + " dup v15.2d, xzr \n\t" // Vector for accummulating column 2 - " prfm PLDL1KEEP, [x0, #256] \n\t" + " prfm pldl1keep,[x2] \n\t" // Prefetch c. " dup v16.2d, xzr \n\t" // Vector for accummulating column 2 - " prfm PLDL1KEEP, [x0, #320] \n\t" + " \n\t" // Since columns of C can cross a cache + // line boundary, we also need to prefetch + // the "ends." + " prfm pldl1keep,[x2, #32] \n\t" // Prefetch c. + " add x23,x22,x10 \n\t" // Load address Column 4 of C + " dup v17.2d, xzr \n\t" // Vector for accummulating column 3 + " prfm pldl1keep,[x20] \n\t" // Prefetch c. + " dup v18.2d, xzr \n\t" // Vector for accummulating column 3 + " prfm pldl1keep,[x20, #32] \n\t" // Prefetch c. + " add x24,x23,x10 \n\t" // Load address Column 5 of C + " dup v19.2d, xzr \n\t" // Vector for accummulating column 3 + " prfm pldl1keep,[x21] \n\t" // Prefetch c. " \n\t" + " dup v20.2d, xzr \n\t" // Vector for accummulating column 4 + " prfm pldl1keep,[x21, #32] \n\t" // Prefetch c. + " add x25,x24,x10 \n\t" // Load address Column 6 of C + " dup v21.2d, xzr \n\t" // Vector for accummulating column 4 + " prfm pldl1keep,[x22] \n\t" // Prefetch c. + " dup v22.2d, xzr \n\t" // Vector for accummulating column 4 + " prfm pldl1keep,[x22, #32] \n\t" // Prefetch c. 
+ " add x26,x25,x10 \n\t" // Load address Column 7 of C + " dup v23.2d, xzr \n\t" // Vector for accummulating column 5 + " prfm pldl1keep,[x23] \n\t" // Prefetch c. + " dup v24.2d, xzr \n\t" // Vector for accummulating column 5 + " prfm pldl1keep,[x23, #32] \n\t" // Prefetch c. + " dup v25.2d, xzr \n\t" // Vector for accummulating column 5 + " prfm pldl1keep,[x24] \n\t" // Prefetch c. " \n\t" " dup v26.2d, xzr \n\t" // Vector for accummulating column 6 + " prfm pldl1keep,[x24, #32] \n\t" // Prefetch c. + " dup v27.2d, xzr \n\t" // Vector for accummulating column 6 + " prfm pldl1keep,[x25] \n\t" // Prefetch c. + " dup v28.2d, xzr \n\t" // Vector for accummulating column 6 + " prfm pldl1keep,[x25, #32] \n\t" // Prefetch c. + " dup v29.2d, xzr \n\t" // Vector for accummulating column 7 + " prfm pldl1keep,[x26] \n\t" // Prefetch c. + " dup v30.2d, xzr \n\t" // Vector for accummulating column 7 + " prfm pldl1keep,[x26, #32] \n\t" // Prefetch c. + " dup v31.2d, xzr \n\t" // Vector for accummulating column 7 " \n\t" " \n\t" + " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. BEQ(DCONSIDERKLEFT) " \n\t" @@ -837,10 +938,10 @@ void bli_dgemm_armv8a_asm_6x8 " ldr q5, [x1, #32] \n\t" " ldr q6, [x1, #48] \n\t" " \n\t" - " add x0, x0, #48 \n\t" //update address of A - " add x1, x1, #64 \n\t" //update address of B + " add x0, x0, #48 \n\t" // Update address of A + " add x1, x1, #64 \n\t" // Update address of B " \n\t" - " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. + " cmp x5,1 \n\t" // If there's only one k_iter, jump to it BEQ(DLASTITER) // (as loop is do-while-like). 
" \n\t" LABEL(DLOOP) // Body @@ -930,7 +1031,7 @@ void bli_dgemm_armv8a_asm_6x8 " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #112] \n\t" - " \n\t" //End it 2 + " \n\t" // End it 2 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x0, #464] \n\t" " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate @@ -1011,7 +1112,7 @@ void bli_dgemm_armv8a_asm_6x8 " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #240] \n\t" - " \n\t" //End it 4 + " \n\t" // End it 4 " add x0, x0, #192 \n\t" " add x1, x1, #256 \n\t" " \n\t" @@ -1100,7 +1201,7 @@ void bli_dgemm_armv8a_asm_6x8 " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #112] \n\t" - " \n\t" //End it 2 + " \n\t" // End it 2 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate @@ -1174,7 +1275,7 @@ void bli_dgemm_armv8a_asm_6x8 " \n\t" " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate - " \n\t" //End it 4 + " \n\t" // End it 4 " add x0, x0, #144 \n\t" " \n\t" LABEL(DCONSIDERKLEFT) @@ -1253,11 +1354,11 @@ void bli_dgemm_armv8a_asm_6x8 " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. 
" \n\t" - " ldr q0, [x2] \n\t" //Load column 0 of C + " ldr q0, [x2] \n\t" // Load column 0 of C " ldr q1, [x2, #16] \n\t" " ldr q2, [x2, #32] \n\t" " \n\t" - " ldr q3, [x20] \n\t" //Load column 1 of C + " ldr q3, [x20] \n\t" // Load column 1 of C " ldr q4, [x20, #16] \n\t" " ldr q5, [x20, #32] \n\t" " \n\t" @@ -1277,11 +1378,11 @@ void bli_dgemm_armv8a_asm_6x8 " fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha " fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" - " str q0, [x2] \n\t" //Store column 0 of C + " str q0, [x2] \n\t" // Store column 0 of C " str q1, [x2, #16] \n\t" " str q2, [x2, #32] \n\t" " \n\t" - " str q3, [x20] \n\t" //Store column 1 of C + " str q3, [x20] \n\t" // Store column 1 of C " str q4, [x20, #16] \n\t" " str q5, [x20, #32] \n\t" " \n\t" @@ -1295,11 +1396,11 @@ void bli_dgemm_armv8a_asm_6x8 " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. " \n\t" - " ldr q8, [x21] \n\t" //Load column 2 of C + " ldr q8, [x21] \n\t" // Load column 2 of C " ldr q9, [x21, #16] \n\t" " ldr q10, [x21, #32] \n\t" " \n\t" - " ldr q11, [x22] \n\t" //Load column 3 of C + " ldr q11, [x22] \n\t" // Load column 3 of C " ldr q12, [x22, #16] \n\t" " ldr q13, [x22, #32] \n\t" " \n\t" @@ -1319,11 +1420,11 @@ void bli_dgemm_armv8a_asm_6x8 " fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha " fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" - " str q8, [x21] \n\t" //Store column 2 of C + " str q8, [x21] \n\t" // Store column 2 of C " str q9, [x21, #16] \n\t" " str q10, [x21, #32] \n\t" " \n\t" - " str q11, [x22] \n\t" //Store column 3 of C + " str q11, [x22] \n\t" // Store column 3 of C " str q12, [x22, #16] \n\t" " str q13, [x22, #32] \n\t" " \n\t" @@ -1337,11 +1438,11 @@ void bli_dgemm_armv8a_asm_6x8 " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
" \n\t" - " ldr q0, [x23] \n\t" //Load column 4 of C + " ldr q0, [x23] \n\t" // Load column 4 of C " ldr q1, [x23, #16] \n\t" " ldr q2, [x23, #32] \n\t" " \n\t" - " ldr q3, [x24] \n\t" //Load column 5 of C + " ldr q3, [x24] \n\t" // Load column 5 of C " ldr q4, [x24, #16] \n\t" " ldr q5, [x24, #32] \n\t" " \n\t" @@ -1361,11 +1462,11 @@ void bli_dgemm_armv8a_asm_6x8 " fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha " fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" - " str q0, [x23] \n\t" //Store column 4 of C + " str q0, [x23] \n\t" // Store column 4 of C " str q1, [x23, #16] \n\t" " str q2, [x23, #32] \n\t" " \n\t" - " str q3, [x24] \n\t" //Store column 5 of C + " str q3, [x24] \n\t" // Store column 5 of C " str q4, [x24, #16] \n\t" " str q5, [x24, #32] \n\t" " \n\t" @@ -1379,11 +1480,11 @@ void bli_dgemm_armv8a_asm_6x8 " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. " \n\t" - " ldr q8, [x25] \n\t" //Load column 6 of C + " ldr q8, [x25] \n\t" // Load column 6 of C " ldr q9, [x25, #16] \n\t" " ldr q10, [x25, #32] \n\t" " \n\t" - " ldr q11, [x26] \n\t" //Load column 7 of C + " ldr q11, [x26] \n\t" // Load column 7 of C " ldr q12, [x26, #16] \n\t" " ldr q13, [x26, #32] \n\t" " \n\t" @@ -1406,11 +1507,11 @@ void bli_dgemm_armv8a_asm_6x8 " fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha " fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" - " str q8, [x25] \n\t" //Store column 6 of C + " str q8, [x25] \n\t" // Store column 6 of C " str q9, [x25, #16] \n\t" " str q10, [x25, #32] \n\t" " \n\t" - " str q11, [x26] \n\t" //Store column 7 of C + " str q11, [x26] \n\t" // Store column 7 of C " str q12, [x26, #16] \n\t" " str q13, [x26, #32] \n\t" " \n\t" @@ -1450,39 +1551,4 @@ void bli_dgemm_armv8a_asm_6x8 GEMM_UKR_FLUSH_CT( d ); } - -#if 0 -void bli_cgemm_armv8a_opt_4x4 - ( - dim_t m, - dim_t n, - dim_t k, - scomplex* restrict alpha, - scomplex* restrict a, - scomplex* restrict b, - scomplex* restrict beta, - scomplex* 
restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* cntx - ) -{ -} - -void bli_zgemm_armv8a_opt_4x4 - ( - dim_t m, - dim_t n, - dim_t k, - dcomplex* restrict alpha, - dcomplex* restrict a, - dcomplex* restrict b, - dcomplex* restrict beta, - dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* cntx - ) -{ -} - -#endif - +// June 2022, removed unused stubs for ancient 4x4 kernels From 9e5594ad5fc41df8ef2825a025d7844ac2275c27 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 11 Aug 2022 14:36:38 -0500 Subject: [PATCH 074/230] Temporarily disabled #line directives from 6826c1c. Details: - Commented out the inclusion of #line preprocessor directives in the flattened header output provided by build/flatten-headers.py. This output was added recently in 6826c1c, but was later found to have thrown off the line numbering referenced by compiler warnings and errors (possibly due to license comment blocks, which are stripped from source headers as they are inlined into the monolithic header). --- build/flatten-headers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/flatten-headers.py b/build/flatten-headers.py index 40fc2a450..ecd4635d1 100755 --- a/build/flatten-headers.py +++ b/build/flatten-headers.py @@ -278,14 +278,14 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Mark the beginning of the header being inserted. ostring += "%s%s%c" % ( beginstr, header, '\n' ) - ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) + #ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) # Recurse on the header, accumulating the string. ostring += flatten_header( header_path, header_dirpaths, cursp + " " ) # Mark the end of the header being inserted. ostring += "%s%s%c" % ( endstr, header, '\n' ) - ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) + #ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) echov2( "%sheader file '%s' fully processed." 
\ % ( cursp, header_path ) ) From dfa54139664a42d29774e140ec9e5597af869a76 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Tue, 30 Aug 2022 08:07:50 +0800 Subject: [PATCH 075/230] Arm64 dgemmsup with extended MR&NR (#655) Details: - Since the number of registers in NEON is large but their lengths are short, I'm here extending both MR and NR. - The approach is to represent the C microtile in registers optionally in columns, so for sizes like 6x7m, the 'crr' kernel is the default with 'rrr' supported through an in-register transpose. - A few asm kernels are crafted for 'rv' to complete this extended size support. - For 'rd' I'm still relying heavily on C99 intrinsic kernels with branching so the performance might not be optimal. (Sorry for that.) - So far, these changes only affect the 'firestorm' subconfig. - This commit also contains row-preferential s12x8 and d6x8 gemm ukernels. These microkernels are templatized versions of the existing s8x12 and d6x8 ukernels defined in bli_gemm_armv8a_asm_d6x8.c. 
--- config/firestorm/bli_cntx_init_firestorm.c | 32 +- kernels/armv8a/3/armv8a_asm_utils.h | 40 ++ kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c | 605 ++++++++++++++++++ .../sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c | 0 kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c | 450 ------------- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c | 190 ++++-- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c | 268 ++++---- .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c | 3 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c | 482 ++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c | 475 ++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c | 477 ++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c | 513 +++++++++++++++ .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c | 126 ++-- .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c | 64 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c | 1 - kernels/armv8a/bli_kernels_armv8a.h | 6 + 16 files changed, 3020 insertions(+), 712 deletions(-) create mode 100644 kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c rename kernels/armv8a/3/{ => old}/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c (100%) delete mode 100644 kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c create mode 100644 kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index 8e4d0088d..bfc7f24b9 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -49,14 +49,14 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) cntx, // level-3 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_12x8r, + BLIS_GEMM_UKR, BLIS_DOUBLE, 
bli_dgemm_armv8a_asm_8x6r, // packm - BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, - BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, - BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, - BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, // gemmsup BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, @@ -77,8 +77,8 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) cntx, // level-3 - BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, - BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, // gemmsup BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, @@ -95,11 +95,11 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 252, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 12, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 4096, 3072, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 9600, 8184, -1, -1 ); // Initialize sup thresholds with architecture-appropriate values. // s d c z @@ -110,8 +110,10 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) // Initialize level-3 sup blocksize objects with architecture-specific // values. 
// s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1, + -1, 9, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1, + -1, 13, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 ); diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h index 0c405dfd2..061cea66d 100644 --- a/kernels/armv8a/3/armv8a_asm_utils.h +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -61,6 +61,18 @@ CLEAR4V(V4,V5,V6,V7) // Scale vectors. +#define SSCALE1V(V,A,IDX) \ +" fmul v"#V".4s, v"#V".4s, v"#A".s["#IDX"] \n\t" +#define SSCALE2V(V0,V1,A,IDX) \ + SSCALE1V(V0,A,IDX) \ + SSCALE1V(V1,A,IDX) +#define SSCALE4V(V0,V1,V2,V3,A,IDX) \ + SSCALE2V(V0,V1,A,IDX) \ + SSCALE2V(V2,V3,A,IDX) +#define SSCALE8V(V0,V1,V2,V3,V4,V5,V6,V7,A,IDX) \ + SSCALE4V(V0,V1,V2,V3,A,IDX) \ + SSCALE4V(V4,V5,V6,V7,A,IDX) + #define DSCALE1V(V,A,IDX) \ " fmul v"#V".2d, v"#V".2d, v"#A".d["#IDX"] \n\t" #define DSCALE2V(V0,V1,A,IDX) \ @@ -74,6 +86,18 @@ DSCALE4V(V4,V5,V6,V7,A,IDX) // Scale-accumulate. 
+#define SSCALEA1V(D,S,A,IDX) \ +" fmla v"#D".4s, v"#S".4s, v"#A".s["#IDX"] \n\t" +#define SSCALEA2V(D0,D1,S0,S1,A,IDX) \ + SSCALEA1V(D0,S0,A,IDX) \ + SSCALEA1V(D1,S1,A,IDX) +#define SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + SSCALEA2V(D0,D1,S0,S1,A,IDX) \ + SSCALEA2V(D2,D3,S2,S3,A,IDX) +#define SSCALEA8V(D0,D1,D2,D3,D4,D5,D6,D7,S0,S1,S2,S3,S4,S5,S6,S7,A,IDX) \ + SSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + SSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX) + #define DSCALEA1V(D,S,A,IDX) \ " fmla v"#D".2d, v"#S".2d, v"#A".d["#IDX"] \n\t" #define DSCALEA2V(D0,D1,S0,S1,A,IDX) \ @@ -95,8 +119,16 @@ #define DLOAD4V(V0,V1,V2,V3,ADDR,SHIFT) \ DLOAD2V(V0,V1,ADDR,SHIFT) \ DLOAD2V(V2,V3,ADDR,SHIFT+32) +#define SLOAD1V DLOAD1V +#define SLOAD2V DLOAD2V +#define SLOAD4V DLOAD4V // Generic: load one line. +#define SLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \ +" ld1 {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \ +" ld1 {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \ +" ld1 {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \ +" ld1 {v"#V".s}[3], ["#ADDR"], "#INC" \n\t" #define DLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \ " ld1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \ " ld1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t" @@ -110,8 +142,16 @@ #define DSTORE4V(V0,V1,V2,V3,ADDR,SHIFT) \ DSTORE2V(V0,V1,ADDR,SHIFT) \ DSTORE2V(V2,V3,ADDR,SHIFT+32) +#define SSTORE1V DSTORE1V +#define SSTORE2V DSTORE2V +#define SSTORE4V DSTORE4V // Generic: store one line. 
+#define SSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \ +" st1 {v"#V".s}[0], ["#ADDR"], "#INC" \n\t" \ +" st1 {v"#V".s}[1], ["#ADDR"], "#INC" \n\t" \ +" st1 {v"#V".s}[2], ["#ADDR"], "#INC" \n\t" \ +" st1 {v"#V".s}[3], ["#ADDR"], "#INC" \n\t" #define DSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \ " st1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \ " st1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t" diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c new file mode 100644 index 000000000..b0df23fb0 --- /dev/null +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d8x6r.c @@ -0,0 +1,605 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" + +// Label locality & misc. +#include "armv8a_asm_utils.h" + +// Nanokernel operations. +#include "armv8a_asm_d2x2.h" + +/* Order of row-major SGEMM_12x8's execution in 4x4 blocks: + * + * +---+ +---+ + * | 0 | | 1 | + * +---+ +---+ + * +---+ +---+ + * | 2 | | 3 | + * +---+ +---+ + * +---+ +---+ + * | 4 | | 5 | + * +---+ +---+ + */ +#define SGEMM_12X8_MKER_LOOP_PLAIN(C00,C01,C10,C11,C20,C21,C30,C31,C40,C41,C50,C51,C60,C61,C70,C71,C80,C81,C90,C91,CA0,CA1,CB0,CB1,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + SGEMM_4X4_NANOKERNEL(C00,C10,C20,C30,B0,A0) \ + SGEMM_4X4_NANOKERNEL(C01,C11,C21,C31,B1,A0) \ + DGEMM_LOAD1V_ ##LOADNEXT (A0,AADDR,ASHIFT) /* Contiguous load is the same across S/D. */ \ + SGEMM_4X4_NANOKERNEL(C40,C50,C60,C70,B0,A1) \ + SGEMM_4X4_NANOKERNEL(C41,C51,C61,C71,B1,A1) \ + DGEMM_LOAD1V_ ##LOADNEXT (A1,AADDR,ASHIFT+16) \ + SGEMM_4X4_NANOKERNEL(C80,C90,CA0,CB0,B0,A2) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + SGEMM_4X4_NANOKERNEL(C81,C91,CA1,CB1,B1,A2) + +// For contiguous storage of C, SLOAD is the same as DLOAD.
+#define SLOADC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define SSTOREC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +/* Order of row-major DGEMM_8x6's execution in 2x2 blocks: + * + * +---+ +---+ +---+ + * | 0 | | 2 | | 4 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 1 | | 3 | | 5 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 6 | | 8 | | 10| + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 7 | | 9 | | 11| + * +---+ +---+ +---+ + * + */ +#define DGEMM_8X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,C60,C61,C62,C70,C71,C72,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ + DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ + DGEMM_LOAD2V_ ##LOADNEXT (A0,A1,AADDR,ASHIFT) \ + DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \ + DGEMM_2X2_NANOKERNEL(C60,C70,B0,A3) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \ + DGEMM_2X2_NANOKERNEL(C61,C71,B1,A3) \ + DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \ + DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \ + DGEMM_2X2_NANOKERNEL(C62,C72,B2,A3) + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ + DLOAD1V(V1,ADDR,IMM) + +#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) +#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ + DGEMM_LOAD1V_load(V1,ADDR,IMM) \ + DGEMM_LOAD1V_load(V2,ADDR,IMM+16) + +// For contiguous storage of C. 
+#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +// Prefetch C. +#define PRFMC_FWD(CADDR,RSC,LASTB) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" prfm PLDL1KEEP, ["#CADDR", "#LASTB"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +void bli_sgemm_armv8a_asm_12x8r + ( + dim_t m, + dim_t n, + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + const void* a_next = bli_auxinfo_next_a( data ); + const void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + GEMM_UKR_SETUP_CT( s, 12, 8, true ); + + __asm__ volatile + ( +" ldr x0, %[a] \n\t" +" ldr x1, %[b] \n\t" +" mov x2, #12 \n\t" // Column-skip of A. +" mov x3, #8 \n\t" // Row-skip of B. +" \n\t" +" ldr x5, %[c] \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. (column-skip == 1) +" \n\t" +" \n\t" // Multiply some address skips by sizeof(float). +" lsl x2, x2, #2 \n\t" // cs_a +" lsl x3, x3, #2 \n\t" // rs_b +" lsl x6, x6, #2 \n\t" // rs_c +" \n\t" +" cmp %w[ct], wzr \n\t" +" mov x9, x5 \n\t" +BNE(SEND_PRFMC_FH) +PRFMC_FWD(x9,x6,32) // Prefetch C 01/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 02/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 03/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 04/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 05/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 06/12. +LABEL(SEND_PRFMC_FH) +" \n\t" +" ldr x4, %[k_mker] \n\t" // Number of loops. 
+" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:23] <- C +// V[24:27] <- A +// V[28:31] <- B +// Under this scheme, the following is defined: +#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + SGEMM_12X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(SLOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(SCLEAR_CCOLS) +" \n\t" +" ldr q24, [x0, #16*0] \n\t" // Load A. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" add x0, x0, x2 \n\t" +" ldr q27, [x0, #16*0] \n\t" +" \n\t" +" cmp %w[ct], wzr \n\t" +BNE(SEND_PRFMC_LH) +PRFMC_FWD(x9,x6,32) // Prefetch C 07/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 08/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 09/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 10/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 11/12. +PRFMC_FWD(x9,x6,32) // Prefetch C 12/12. +LABEL(SEND_PRFMC_LH) +" cmp x4, #0 \n\t" // Reset branching flag. +" \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B. +" ldr q29, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +" ldr q30, [x1, #16*0] \n\t" +" ldr q31, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +LABEL(SCLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR8V(16,17,18,19,20,21,22,23) +// No-microkernel early return, once again. +BEQ(SK_LEFT_LOOP) +// +// Microkernel is defined here as: +#define SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1) \ + SGEMM_12X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,x0,16,x1,0,load) \ + "add x0, x0, x2 \n\t" \ + "ldr q"#A2", [x0, #16*0] \n\t" \ + "ldr q"#B1", [x1, #16*1] \n\t" \ + "add x1, x1, x3 \n\t" +// Start microkernel loop. +LABEL(SK_MKER_LOOP) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,30,31) +" \n\t" // Decrease counter before final replica. 
+" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(SFIN_MKER_LOOP) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,30,31) +BRANCH(SK_MKER_LOOP) +// +// Final microkernel loop. +LABEL(SFIN_MKER_LOOP) +SGEMM_12X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,xzr,-1,xzr,-1,noload) +" ldr q26, [x0, #16*1] \n\t" +" ldr q27, [x0, #16*2] \n\t" +" add x0, x0, x2 \n\t" +SGEMM_12X8_MKER_LOOP_PLAIN_LOC(25,26,27,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(SK_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(SWRITE_MEM_PREP) +" ldr q24, [x0, #16*0] \n\t" // Load A col. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row. +" ldr q29, [x1, #16*1] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +SGEMM_12X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,xzr,-1,xzr,-1,noload) +BRANCH(SK_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(SWRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v24.4s}, [x4] \n\t" // Load alpha & beta. +" ld1r {v25.4s}, [x8] \n\t" +" \n\t" +LABEL(SPREFETCH_ABNEXT) +" ldr x0, %[a_next] \n\t" +" ldr x1, %[b_next] \n\t" +" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size, +" prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions +" prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. +" prfm PLDL1STRM, [x1, 64*0] \n\t" +" prfm PLDL1STRM, [x1, 64*1] \n\t" +" prfm PLDL1STRM, [x1, 64*3] \n\t" +" \n\t" +" fmov d26, #1.0 \n\t" +" fcvt s26, d26 \n\t" +" fcmp s24, s26 \n\t" +BEQ(SUNIT_ALPHA) +SSCALE8V(0,1,2,3,4,5,6,7,24,0) +SSCALE8V(8,9,10,11,12,13,14,15,24,0) +SSCALE8V(16,17,18,19,20,21,22,23,24,0) +LABEL(SUNIT_ALPHA) +" \n\t" +" mov x9, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +// +// Contiguous C-storage. 
+LABEL(SWRITE_MEM_R) +" fcmp s25, #0.0 \n\t" // Sets conditional flag whether *beta == 0. +" \n\t" // This conditional flag will be used +" \n\t" // multiple times for skipping load. +// Row 0 & 1 & 2: +BEQ(SZERO_BETA_R_0_1_2) +SLOADC_2V_R_FWD(26,27,x9,0,x6) +SLOADC_2V_R_FWD(28,29,x9,0,x6) +SLOADC_2V_R_FWD(30,31,x9,0,x6) +SSCALEA2V(0,1,26,27,25,0) +SSCALEA2V(2,3,28,29,25,0) +SSCALEA2V(4,5,30,31,25,0) +LABEL(SZERO_BETA_R_0_1_2) +SSTOREC_2V_R_FWD(0,1,x5,0,x6) +SSTOREC_2V_R_FWD(2,3,x5,0,x6) +SSTOREC_2V_R_FWD(4,5,x5,0,x6) +// Row 3 & 4 & 5 & 6 & 7 & 8: +BEQ(SZERO_BETA_R_3_4_5_6_7_8) +SLOADC_2V_R_FWD(26,27,x9,0,x6) +SLOADC_2V_R_FWD(28,29,x9,0,x6) +SLOADC_2V_R_FWD(30,31,x9,0,x6) +SLOADC_2V_R_FWD(0,1,x9,0,x6) +SLOADC_2V_R_FWD(2,3,x9,0,x6) +SLOADC_2V_R_FWD(4,5,x9,0,x6) +SSCALEA4V(6,7,8,9,26,27,28,29,25,0) +SSCALEA4V(10,11,12,13,30,31,0,1,25,0) +SSCALEA4V(14,15,16,17,2,3,4,5,25,0) +LABEL(SZERO_BETA_R_3_4_5_6_7_8) +SSTOREC_2V_R_FWD(6,7,x5,0,x6) +SSTOREC_2V_R_FWD(8,9,x5,0,x6) +SSTOREC_2V_R_FWD(10,11,x5,0,x6) +SSTOREC_2V_R_FWD(12,13,x5,0,x6) +SSTOREC_2V_R_FWD(14,15,x5,0,x6) +SSTOREC_2V_R_FWD(16,17,x5,0,x6) +// Row 9 & 10 & 11 +BEQ(SZERO_BETA_R_9_10_11) +SLOADC_2V_R_FWD(26,27,x9,0,x6) +SLOADC_2V_R_FWD(28,29,x9,0,x6) +SLOADC_2V_R_FWD(30,31,x9,0,x6) +SSCALEA2V(18,19,26,27,25,0) +SSCALEA2V(20,21,28,29,25,0) +SSCALEA2V(22,23,30,31,25,0) +LABEL(SZERO_BETA_R_9_10_11) +SSTOREC_2V_R_FWD(18,19,x5,0,x6) +SSTOREC_2V_R_FWD(20,21,x5,0,x6) +SSTOREC_2V_R_FWD(22,23,x5,0,x6) +// Done. +LABEL(SEND_WRITE_MEM) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_c] "m" (rs_c), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta), + [a_next] "m" (a_next), + [b_next] "m" (b_next), + [ct] "r" (_use_ct) // Defined by macro. 
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" + ); + + GEMM_UKR_FLUSH_CT( s ); +} + +/* + * Differences from the col-major 6x8 in HW modeling: + * * Stream HW prefetcher is assumed s.t. PRFM instructions for packed A&B are omitted. + */ +void bli_dgemm_armv8a_asm_8x6r + ( + dim_t m, + dim_t n, + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + const void* a_next = bli_auxinfo_next_a( data ); + const void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k / 4; + uint64_t k_left = k % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + GEMM_UKR_SETUP_CT( d, 8, 6, true ); + + __asm__ volatile + ( +" ldr x0, %[a] \n\t" +" ldr x1, %[b] \n\t" +" mov x2, #8 \n\t" // Column-skip of A. +" mov x3, #6 \n\t" // Row-skip of B. +" \n\t" +" ldr x5, %[c] \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. (column-skip == 1) +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" \n\t" +" cmp %w[ct], wzr \n\t" +" mov x9, x5 \n\t" +BNE(DEND_PRFMC) +PRFMC_FWD(x9,x6,40) // Prefetch C 1/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 2/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 3/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 4/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 5/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 6/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 7/8. +PRFMC_FWD(x9,x6,40) // Prefetch C 8/8. +LABEL(DEND_PRFMC) +" \n\t" +" ldr x4, %[k_mker] \n\t" // Number of loops. 
+" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:23] <- C +// V[24:27] <- A +// V[28:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_8X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(DLOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(DCLEAR_CCOLS) +" \n\t" +" ldr q24, [x0, #16*0] \n\t" // Load A. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" ldr q27, [x0, #16*3] \n\t" +" add x0, x0, x2 \n\t" +" \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" ldr q31, [x1, #16*0] \n\t" +LABEL(DCLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR8V(16,17,18,19,20,21,22,23) +// No-microkernel early return, once again. +BEQ(DK_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2) \ + DGEMM_8X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,x0,0,x1,16,load) \ + "add x1, x1, x3 \n\t" \ + "ldr q"#B2", [x1, #16*0] \n\t" \ + "ldr q"#A2", [x0, #16*2] \n\t" \ + "ldr q"#A3", [x0, #16*3] \n\t" \ + "add x0, x0, x2 \n\t" +// Start microkernel loop. +LABEL(DK_MKER_LOOP) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,31,28,29) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(DFIN_MKER_LOOP) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,30,31,28) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,29,30,31) +BRANCH(DK_MKER_LOOP) +// +// Final microkernel loop. 
+LABEL(DFIN_MKER_LOOP) +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,30,31,28,x0,0,x1,16,load) +" add x1, x1, x3 \n\t" +" ldr q26, [x0, #16*2] \n\t" +" ldr q27, [x0, #16*3] \n\t" +" add x0, x0, x2 \n\t" +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,29,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(DK_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(DWRITE_MEM_PREP) +" ldr q24, [x0, #16*0] \n\t" // Load A col. +" ldr q25, [x0, #16*1] \n\t" +" ldr q26, [x0, #16*2] \n\t" +" ldr q27, [x0, #16*3] \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_8X6_MKER_LOOP_PLAIN_LOC(24,25,26,27,28,29,30,xzr,-1,xzr,-1,noload) +BRANCH(DK_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(DWRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v24.2d}, [x4] \n\t" // Load alpha & beta. +" ld1r {v25.2d}, [x8] \n\t" +" \n\t" +LABEL(DPREFETCH_ABNEXT) +" ldr x0, %[a_next] \n\t" +" ldr x1, %[b_next] \n\t" +" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size, +" prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions +" prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. +" prfm PLDL1STRM, [x1, 64*0] \n\t" +" prfm PLDL1STRM, [x1, 64*1] \n\t" +" prfm PLDL1STRM, [x1, 64*3] \n\t" +" \n\t" +" fmov d26, #1.0 \n\t" +" fcmp d24, d26 \n\t" +BEQ(DUNIT_ALPHA) +DSCALE8V(0,1,2,3,4,5,6,7,24,0) +DSCALE8V(8,9,10,11,12,13,14,15,24,0) +DSCALE8V(16,17,18,19,20,21,22,23,24,0) +LABEL(DUNIT_ALPHA) +" \n\t" +" mov x9, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +// +// Contiguous C-storage. +LABEL(DWRITE_MEM_R) +" fcmp d25, #0.0 \n\t" // Sets conditional flag whether *beta == 0. +" \n\t" // This conditional flag will be used +" \n\t" // multiple times for skipping load. 
+// Row 0 & 1: +BEQ(DZERO_BETA_R_0_1) +DLOADC_3V_R_FWD(26,27,28,x9,0,x6) +DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DSCALEA2V(0,1,26,27,25,0) +DSCALEA2V(2,3,28,29,25,0) +DSCALEA2V(4,5,30,31,25,0) +LABEL(DZERO_BETA_R_0_1) +DSTOREC_3V_R_FWD(0,1,2,x5,0,x6) +DSTOREC_3V_R_FWD(3,4,5,x5,0,x6) +// Row 2 & 3 & 4 & 5: +BEQ(DZERO_BETA_R_2_3_4_5) +DLOADC_3V_R_FWD(26,27,28,x9,0,x6) +DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DLOADC_3V_R_FWD(0,1,2,x9,0,x6) +DLOADC_3V_R_FWD(3,4,5,x9,0,x6) +DSCALEA4V(6,7,8,9,26,27,28,29,25,0) +DSCALEA4V(10,11,12,13,30,31,0,1,25,0) +DSCALEA4V(14,15,16,17,2,3,4,5,25,0) +LABEL(DZERO_BETA_R_2_3_4_5) +DSTOREC_3V_R_FWD(6,7,8,x5,0,x6) +DSTOREC_3V_R_FWD(9,10,11,x5,0,x6) +DSTOREC_3V_R_FWD(12,13,14,x5,0,x6) +DSTOREC_3V_R_FWD(15,16,17,x5,0,x6) +// Row 6 & 7 +BEQ(DZERO_BETA_R_6_7) +DLOADC_3V_R_FWD(26,27,28,x9,0,x6) +DLOADC_3V_R_FWD(29,30,31,x9,0,x6) +DSCALEA2V(18,19,26,27,25,0) +DSCALEA2V(20,21,28,29,25,0) +DSCALEA2V(22,23,30,31,25,0) +LABEL(DZERO_BETA_R_6_7) +DSTOREC_3V_R_FWD(18,19,20,x5,0,x6) +DSTOREC_3V_R_FWD(21,22,23,x5,0,x6) +// Done. +LABEL(DEND_WRITE_MEM) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_c] "m" (rs_c), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta), + [a_next] "m" (a_next), + [b_next] "m" (b_next), + [ct] "r" (_use_ct) // Defined by macro. 
+: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" + ); + + GEMM_UKR_FLUSH_CT( d ); +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c b/kernels/armv8a/3/old/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c similarity index 100% rename from kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c rename to kernels/armv8a/3/old/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c diff --git a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c deleted file mode 100644 index 44e0ac419..000000000 --- a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// Separate instantiation for Armv8-A reference kernels. -// Temporary workaround. Will be removed after upstream has switched to a better way -// of exposing gemmsup interface. - -// -// -- Row storage case --------------------------------------------------------- -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data, \ - cntx_t* cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. 
*/ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. 
*/ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. 
*/ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_r, _armv8a, _ref2 ) - -// -// -- Column storage case ------------------------------------------------------ -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data, \ - cntx_t* cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. 
*/ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. 
*/ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. 
*/ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_c, _armv8a, _ref2 ) - diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c index cade3ee05..847bfe8da 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -109,6 +108,83 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_3x4m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( n0 == 4 ); + + for ( ; m0 >= 3; m0 -= 3 ) + { + bli_dgemmsup_rd_armv8a_asm_3x4 + ( + conja, conjb, 3, 4, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + a += 3 * rs_a0; + c += 3 * rs_c0; + } + + if ( m0 > 0 ) + { + bli_dgemmsup_rd_armv8a_int_3x4 + ( + conja, conjb, m0, 4, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + } +} + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_3xcm + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* 
restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + for ( ; m0 > 0; m0 -= 3 ) + { + dim_t m_loc = ( m0 < 3 ) ? m0 : 3; + + bli_dgemmsup_rd_armv8a_int_3x4 + ( + conja, conjb, m_loc, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + + a += 3 * rs_a0; + c += 3 * rs_c0; + } +} + + void bli_dgemmsup_rd_armv8a_asm_6x8m ( conj_t conja, @@ -127,58 +203,74 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m { if ( n0 != 8 ) { - if ( n0 < 8 ) + assert( n0 <= 13 ); + + // Manual separation. + dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dgemmsup_ker_ft ker_fp3 = NULL; + dim_t nr1, nr2, nr3; + + switch ( n0 ) { - for ( ; n0 >= 4; n0 -= 4 ) - { - dim_t m = m0; - double *a_loc = a; - double *c_loc = c; - - for ( ; m >= 3; m -= 3 ) - { - bli_dgemmsup_rd_armv8a_asm_3x4 - ( - conja, conjb, 3, 4, k0, - alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - a_loc += 3 * rs_a0; - c_loc += 3 * rs_c0; - } - - if ( m > 0 ) - { - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, m, 4, k0, - alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - } - b += 4 * cs_b0; - c += 4 * cs_c0; - } - - for ( ; m0 > 0; m0 -= 3 ) - { - dim_t m_loc = ( m0 < 3 ) ? m0 : 3; - - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, m_loc, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - - a += 3 * rs_a0; - c += 3 * rs_c0; - } + case 13: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; + ker_fp3 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr3 = 2; break; + case 12: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr2 = 4; break; + case 11: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. 
+ ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break; + case 10: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break; + case 9: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8m; nr1 = 8; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 1; break; + case 7: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 3; break; + case 6: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x4m; nr1 = 4; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break; + case 5: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 3; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr2 = 2; break; + case 4: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = 4; break; + default: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3xcm; nr1 = n0; break; } - else + + ker_fp1 + ( + conja, conjb, m0, nr1, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += nr1 * cs_b0; + c += nr1 * cs_c0; + if ( ker_fp2 ) { - assert( FALSE ); + ker_fp2 + ( + conja, conjb, m0, nr2, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += nr2 * cs_b0; + c += nr2 * cs_c0; } + if ( ker_fp3 ) + ker_fp3 + ( + conja, conjb, m0, nr3, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + return; } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c index 06c9ac32c..c4fb7cac6 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. 
#include "../armv8a_asm_utils.h" @@ -102,6 +101,122 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_4x8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( m0 == 4 ); + + for ( ; n0 > 0; n0 -= 8 ) + { + // Call twice the 2xc kernel in column order. + dim_t n_loc = ( n0 < 8 ) ? n0 : 8; + bli_dgemmsup_rd_armv8a_int_2x8 + ( + conja, conjb, 2, n_loc, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + bli_dgemmsup_rd_armv8a_int_2x8 + ( + conja, conjb, 2, n_loc, k0, + alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx + ); + b += 8 * cs_b0; + c += 8 * cs_c0; + } +} + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_3x8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( m0 == 3 ); + + for ( ; n0 >= 4; n0 -= 4 ) + { + bli_dgemmsup_rd_armv8a_asm_3x4 + ( + conja, conjb, 3, 4, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += 4 * cs_b0; + c += 4 * cs_c0; + } + if ( n0 > 0 ) + { + bli_dgemmsup_rd_armv8a_int_3x4 + ( + conja, conjb, 3, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + } +} + +BLIS_INLINE +void bli_dgemmsup_rd_armv8a_inline_rx8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + 
double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( m0 <= 2 ); + + for ( ; n0 > 0; n0 -= 8 ) + { + dim_t n_loc = ( n0 < 8 ) ? n0 : 8; + bli_dgemmsup_rd_armv8a_int_2x8 + ( + conja, conjb, m0, n_loc, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += 8 * cs_b0; + c += 8 * cs_c0; + } +} + + void bli_dgemmsup_rd_armv8a_asm_6x8n ( conj_t conja, @@ -120,116 +235,51 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n { if ( m0 != 6 ) { - if ( m0 < 6 ) - { - if ( m0 == 5 ) - { - // 3xk calls. - dim_t n = n0; - double *b_loc = b; - double *c_loc = c; - for ( ; n >= 4; n -= 4 ) - { - bli_dgemmsup_rd_armv8a_asm_3x4 - ( - conja, conjb, 3, 4, k0, - alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - b_loc += 4 * cs_b0; - c_loc += 4 * cs_c0; - } - if ( n > 0 ) - { - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, 3, n, k0, - alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0, - beta, c_loc, rs_c0, cs_c0, data, cntx - ); - } - a += 3 * rs_a0; - c += 3 * rs_c0; - - // 2xk calls. - for ( ; n0 > 0; n0 -= 8 ) - { - dim_t n_loc = ( n0 < 8 ) ? n0 : 8; - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, 2, n_loc, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - b += 8 * cs_b0; - c += 8 * cs_c0; - } - return; - } - else if ( m0 == 4 ) - { - for ( ; n0 > 0; n0 -= 8 ) - { - dim_t n_loc = ( n0 < 8 ) ? 
n0 : 8; - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, 2, n_loc, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, 2, n_loc, k0, - alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx - ); - b += 8 * cs_b0; - c += 8 * cs_c0; - } - } - else if ( m0 == 3 ) - { - for ( ; n0 >= 4; n0 -= 4 ) - { - bli_dgemmsup_rd_armv8a_asm_3x4 - ( - conja, conjb, 3, 4, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - b += 4 * cs_b0; - c += 4 * cs_c0; - } - if ( n0 > 0 ) - { - bli_dgemmsup_rd_armv8a_int_3x4 - ( - conja, conjb, 3, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - } - } - else // m0 == 2 or 1. - { - for ( ; n0 > 0; n0 -= 8 ) - { - dim_t n_loc = ( n0 < 8 ) ? n0 : 8; - bli_dgemmsup_rd_armv8a_int_2x8 - ( - conja, conjb, m0, n_loc, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - b += 8 * cs_b0; - c += 8 * cs_c0; - } - } - } - else + assert( m0 <= 9 ); + + // Manual separation. + dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dim_t mr1, mr2; + + switch ( m0 ) { - assert( FALSE ); + case 9: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function. + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr2 = 3; break; + case 8: + ker_fp1 = bli_dgemmsup_rd_armv8a_asm_6x8n; mr1 = 6; // This function. 
+ ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break; + case 7: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr2 = 4; break; + case 5: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; + ker_fp2 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr2 = 2; break; + case 4: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_4x8n; mr1 = 4; break; + case 3: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_3x8n; mr1 = 3; break; + default: + ker_fp1 = bli_dgemmsup_rd_armv8a_inline_rx8n; mr1 = m0; break; } + + ker_fp1 + ( + conja, conjb, mr1, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + a += mr1 * rs_a0; + c += mr1 * rs_c0; + if ( ker_fp2 ) + ker_fp2 + ( + conja, conjb, mr2, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + return; } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c index bc7402a5f..b7d1a7d0f 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c @@ -36,7 +36,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -76,6 +75,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" +// For row-storage of C. #define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" @@ -83,6 +83,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" +// For column-storage of C. 
#define DLOADC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \ DLOAD2V(C00,C10,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" \ diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c new file mode 100644 index 000000000..eaddfd076 --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d5x8n.c @@ -0,0 +1,482 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" +#include "assert.h" + + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. +#include "../armv8a_asm_d2x2.h" + +/* Order of row-major DGEMM_5x8's execution in 2x2 blocks: + * + * +---+ +---+ +---+ +---+ + * | 0 | | 1 | | 6 | | 7 | + * +---+ +---+ +---+ +---+ + * +---+ +---+ +---+ +---+ + * | 2 | | 3 | | 8 | | 9 | + * +---+ +---+ +---+ +---+ + * ----- ----- ----- ----- + * 4 5 10 11 + */ +#define DGEMM_5X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ +" fmla v"#C40".2d, v"#B0".2d, v"#A2".d[0] \n\t" \ +" fmla v"#C41".2d, v"#B1".2d, v"#A2".d[0] \n\t" \ + DGEMM_LOAD2V_ ##LOADNEXT (B0,B1,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \ + DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ + DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \ +" fmla v"#C42".2d, v"#B2".2d, v"#A2".d[0] \n\t" \ +" fmla v"#C43".2d, v"#B3".2d, v"#A2".d[0] \n\t" + +// Interleaving load or not. 
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) +#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ + DGEMM_LOAD1V_load(V1,ADDR,IMM) \ + DGEMM_LOAD1V_load(V2,ADDR,IMM+16) + +#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) +#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ +" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ +" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" + +// Prefetch C in the long direction. +#define DPRFMC_FWD(CADDR,DLONGC) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +// For row-storage of C. +#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ + DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ + DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +// For column-storage of C: Store 2+1/2 vectors. +#define DLOADC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ +" ld1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" +#define DSTOREC_2PHV_C_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,CSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ +" st1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" + +#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) \ + DSCALE4V(V0,V1,V2,V3,A,IDX) \ + DSCALE1V(V4,A,IDX) +#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) \ + DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + DSCALEA1V(D4,S4,A,IDX) + + +void bli_dgemmsup_rv_armv8a_asm_5x8n + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + 
cntx_t* cntx + ) +{ + assert( m0 == 5 ); + + // LLVM has very bad routing ability for inline asm. + // Limit number of registers in case of Clang compilation. +#ifndef __clang__ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); +#endif + uint64_t ps_b = bli_auxinfo_ps_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 6; + uint64_t k_left = k0 % 6; + + int64_t n_iter = n0 / 8; + int64_t n_left = n0 % 8; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( n_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[b] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[n_iter] \n\t" +" ldr x11, %[ps_b] \n\t" // Panel-skip of B. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. +" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_b +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +" mov x1, x5 \n\t" +" cmp x7, #8 \n\t" // Prefetch column-strided C. +BEQ(C_PREFETCH_COLS) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +DPRFMC_FWD(x1,x6) +BRANCH(C_PREFETCH_END) +LABEL(C_PREFETCH_COLS) +// This prefetch will not cover further mker parts. Skip. +// +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +// DPRFMC_FWD(x1,x7) +LABEL(C_PREFETCH_END) +// +// Millikernel. 
+LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x1, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop. +" ldr x0, %[a] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:19] <- C +// V[20:25] <- A +// V[26:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_5X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" ldr q26, [x1, #16*0] \n\t" // Load B first. +" ldr q27, [x1, #16*1] \n\t" +" ldr q28, [x1, #16*2] \n\t" +" ldr q29, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +" ldr q30, [x1, #16*0] \n\t" +" ldr q31, [x1, #16*1] \n\t" +" \n\t" +" mov x14, x0 \n\t" // Load A. +" ld1 {v20.d}[0], [x14], x9 \n\t" // We want A to be kept in L1. +" ld1 {v20.d}[1], [x14], x9 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" ld1 {v24.d}[0], [x14], x9 \n\t" +" ld1 {v24.d}[1], [x14], x9 \n\t" +" ld1 {v25.d}[0], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +LABEL(CLEAR_CCOLS) +CLEAR8V(0,1,2,3,4,5,6,7) +CLEAR8V(8,9,10,11,12,13,14,15) +CLEAR4V(16,17,18,19) +// No-microkernel early return, once again. +BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \ + DGEMM_5X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x14,x9,x1,16*2,load) \ + "add x1, x1, x3 \n\t" \ + "ldr q"#B2", [x1, #16*0] \n\t" /* Next B line. 
*/ \ + "ldr q"#B3", [x1, #16*1] \n\t" \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" /* Finish A line. */ \ + "add x0, x0, x2 \n\t" \ + "mov x14, x0 \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,26,27,28,29) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,30,31,26,27) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,28,29,30,31) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,26,27,28,29) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,30,31,26,27) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,30,31,26,27,xzr,-1,xzr,-1,noload) +" ldr q30, [x1, #16*2] \n\t" +" ldr q31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_5X8_MKER_LOOP_PLAIN_LOC(23,24,25,28,29,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" ldr q26, [x1, #16*0] \n\t" // Load B row. +" ldr q27, [x1, #16*1] \n\t" +" ldr q28, [x1, #16*2] \n\t" +" ldr q29, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +" mov x14, x0 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" // Load A col. +" ld1 {v20.d}[1], [x14], x9 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_5X8_MKER_LOOP_PLAIN_LOC(20,21,22,26,27,28,29,xzr,-1,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta. 
+" ld1r {v31.2d}, [x8] \n\t" +" fmov d20, #1.0 \n\t" +" fcmp d30, d20 \n\t" +BEQ(UNIT_ALPHA_R) +DSCALE8V(0,1,2,3,4,5,6,7,30,0) +DSCALE8V(8,9,10,11,12,13,14,15,30,0) +DSCALE4V(16,17,18,19,30,0) +LABEL(UNIT_ALPHA_R) +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x7, #8 \n\t" // Check for column-storage. +BNE(WRITE_MEM_C) +// +// C storage in rows. +LABEL(WRITE_MEM_R) +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_R_1_2) +DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) +DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) +DSCALEA4V(0,1,2,3,20,21,22,23,31,0) +DSCALEA4V(4,5,6,7,24,25,26,27,31,0) +LABEL(ZERO_BETA_R_1_2) +DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) +DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) +BEQ(ZERO_BETA_R_3_4_5) +DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) +DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) +DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) +DSCALEA8V(8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,31,0) +DSCALEA4V(16,17,18,19,0,1,2,3,31,0) +LABEL(ZERO_BETA_R_3_4_5) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_R) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_R) +#endif +DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) +DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6) +DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6) +BRANCH(END_WRITE_MEM) +// +// C storage in columns. +LABEL(WRITE_MEM_C) +// In-register transpose, +// do transposition in row-order. +" trn1 v20.2d, v0.2d, v4.2d \n\t" // Row 0-1. +" trn2 v21.2d, v0.2d, v4.2d \n\t" +" trn1 v22.2d, v1.2d, v5.2d \n\t" +" trn2 v23.2d, v1.2d, v5.2d \n\t" +" trn1 v24.2d, v2.2d, v6.2d \n\t" +" trn2 v25.2d, v2.2d, v6.2d \n\t" +" trn1 v26.2d, v3.2d, v7.2d \n\t" +" trn2 v27.2d, v3.2d, v7.2d \n\t" +" \n\t" +" trn1 v0.2d, v8.2d, v12.2d \n\t" // Row 2-3. 
+" trn2 v1.2d, v8.2d, v12.2d \n\t" +" trn1 v2.2d, v9.2d, v13.2d \n\t" +" trn2 v3.2d, v9.2d, v13.2d \n\t" +" trn1 v4.2d, v10.2d, v14.2d \n\t" +" trn2 v5.2d, v10.2d, v14.2d \n\t" +" trn1 v6.2d, v11.2d, v15.2d \n\t" +" trn2 v7.2d, v11.2d, v15.2d \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_C_1_2_3_4) +DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8) +DSCALEA5V(20,0,21,1,16,8,9,11,12,10,31,0) +DSCALEA5V(22,2,23,3,17,13,14,28,29,15,31,0) +LABEL(ZERO_BETA_C_1_2_3_4) +DSTOREC_2PHV_C_FWD(20,0,16,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(21,1,16,1,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(22,2,17,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(23,3,17,1,x5,0,x7,x8) +BEQ(ZERO_BETA_C_5_6_7_8) +DLOADC_2PHV_C_FWD(8,9,10,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(11,12,10,1,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(13,14,15,0,x1,0,x7,x8) +DLOADC_2PHV_C_FWD(28,29,15,1,x1,0,x7,x8) +DSCALEA5V(24,4,25,5,18,8,9,11,12,10,31,0) +DSCALEA5V(26,6,27,7,19,13,14,28,29,15,31,0) +LABEL(ZERO_BETA_C_5_6_7_8) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_C) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_C) +#endif +DSTOREC_2PHV_C_FWD(24,4,18,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(25,5,18,1,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(26,6,19,0,x5,0,x7,x8) +DSTOREC_2PHV_C_FWD(27,7,19,1,x5,0,x7,x8) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #8 \n\t" +" madd x13, x7, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward B's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. 
+LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_b] "m" (ps_b), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + // In Clang, even "m"-passed parameter takes 1 register. + // Have to disable prefetching to pass compilation. +#ifndef __clang__ + [a_next] "r" (a_next), + [b_next] "r" (b_next), +#endif + [n_iter] "m" (n_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + +consider_edge_cases: + // Forward address. + b = b + n_iter * ps_b; + c = c + n_iter * 8 * cs_c; + if ( n_left ) + { + // Set panel stride to unpacked mode. + // Only 1 millikernel w.r.t. 6x8 is executed. + auxinfo_t data_d6x4mn = *data; + bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); + // + bli_dgemmsup_rv_armv8a_int_6x4mn + ( + conja, conjb, 5, n_left, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx + ); + } + +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c new file mode 100644 index 000000000..91d6ca596 --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x5m.c @@ -0,0 +1,475 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+*/
+
+#include "blis.h"
+#include "assert.h"
+
+
+// Label locality & misc.
+#include "../armv8a_asm_utils.h"
+
+// Nanokernel operations.
+#include "../armv8a_asm_d2x2.h"
+
+/* Odd-NR dgemmsup_rv_*m kernels are special in that,
+ * despite the row-major name, C is laid out in COLUMNS in the register space.
+ *
+ * Block order:
+ *
+ *   +---+ +---+
+ *   | 0 | | 3 |  |6
+ *   +---+ +---+  |
+ *   +---+ +---+
+ *   | 1 | | 4 |  |7
+ *   +---+ +---+  |
+ *   +---+ +---+
+ *   | 2 | | 5 |  |8
+ *   +---+ +---+  |
+ *
+ */
+#define DGEMM_C6X5_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C10,C11,C12,C13,C14,C20,C21,C22,C23,C24,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \
+  DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \
+  DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
+  DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \
+  DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \
+  DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \
+  DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \
+" fmla v"#C04".2d, v"#A0".2d, v"#B2".d["#BIDX"] \n\t" \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
+" fmla v"#C14".2d, v"#A1".2d, v"#B2".d["#BIDX"] \n\t" \
+  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
+" fmla v"#C24".2d, v"#A2".2d, v"#B2".d["#BIDX"] \n\t"
+
+// Interleaved loads: the _noload variants expand to nothing, the _load variants emit the load.
+#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
+#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
+" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"
+
+#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
+#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
+" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
+" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"
+
+// Prefetch C at CADDR, then advance CADDR by DLONGC (the long direction).
+#define DPRFMC_FWD(CADDR,DLONGC) \
+" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
+" add "#CADDR", "#CADDR", "#DLONGC" \n\t"
+
+// For column-storage of C: load/store 3 vectors, then advance CADDR by CSC.
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+  DLOAD1V(C2,CADDR,CSHIFT+32) \
+" add "#CADDR", "#CADDR", "#CSC" \n\t"
+#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+  DSTORE1V(C2,CADDR,CSHIFT+32) \
+" add "#CADDR", "#CADDR", "#CSC" \n\t"
+
+// For row-storage of C: load/store 2+1/2 vectors (lane CIDX of C2 at CADDR+CSHIFT+32 via CTMP).
+#define DLOADC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \
+" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \
+  DLOAD2V(C0,C1,CADDR,CSHIFT) \
+" ld1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \
+" add "#CADDR", "#CADDR", "#RSC" \n\t"
+#define DSTOREC_2PHV_R_FWD(C0,C1,C2,CIDX,CADDR,CSHIFT,RSC,CTMP) \
+" add "#CTMP", "#CADDR", "#CSHIFT"+32 \n\t" \
+  DSTORE2V(C0,C1,CADDR,CSHIFT) \
+" st1 {v"#C2".d}["#CIDX"], ["#CTMP"] \n\t" \
+" add "#CADDR", "#CADDR", "#RSC" \n\t"
+
+#define DSCALE5V(V0,V1,V2,V3,V4,A,IDX) \
+  DSCALE4V(V0,V1,V2,V3,A,IDX) \
+  DSCALE1V(V4,A,IDX)
+#define DSCALEA5V(D0,D1,D2,D3,D4,S0,S1,S2,S3,S4,A,IDX) \
+  DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
+  DSCALEA1V(D4,S4,A,IDX)
+
+// dgemmsup millikernel for n0 == 5 and cs_b0 == 1 (both asserted): the asm loops over
+// m0 / 6 row panels of A/C; the m0 % 6 rows left over go to bli_dgemmsup_rv_armv8a_int_6x4mn.
+void bli_dgemmsup_rv_armv8a_asm_6x5m
+     (
+       conj_t conja,
+       conj_t conjb,
+       dim_t m0,
+       dim_t n0,
+       dim_t k0,
+       double* restrict alpha,
+       double* restrict a, inc_t rs_a0, inc_t cs_a0,
+       double* restrict b, inc_t rs_b0, inc_t cs_b0,
+       double* restrict beta,
+       double* restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* data,
+       cntx_t* cntx
+     )
+{
+  assert( n0 == 5 );
+
+  // LLVM has very bad routing ability for inline asm.
+  // Limit number of registers in case of Clang compilation.
+#ifndef __clang__
+  void* a_next = bli_auxinfo_next_a( data );
+  void* b_next = bli_auxinfo_next_b( data );
+#endif
+  uint64_t ps_a = bli_auxinfo_ps_a( data );
+
+  // Typecast local copies of integers in case dim_t and inc_t are a
+  // different size than is expected by load instructions.
+  uint64_t k_mker = k0 / 6;
+  uint64_t k_left = k0 % 6;
+
+  int64_t m_iter = m0 / 6;
+  int64_t m_left = m0 % 6;
+
+  uint64_t rs_a = rs_a0;
+  uint64_t cs_a = cs_a0;
+  uint64_t rs_b = rs_b0;
+  uint64_t rs_c = rs_c0;
+  uint64_t cs_c = cs_c0;
+  // uint64_t cs_b = cs_b0;
+  assert( cs_b0 == 1 );
+
+  if ( m_iter == 0 ) goto consider_edge_cases;
+
+  __asm__ volatile
+  (
+" ldr x10, %[a] \n\t"
+" ldr x13, %[c] \n\t"
+" ldr x12, %[m_iter] \n\t"
+" ldr x11, %[ps_a] \n\t"  // Panel-skip of A.
+" ldr x9, %[rs_a] \n\t"  // Row-skip of A.
+" ldr x2, %[cs_a] \n\t"  // Column-skip of A.
+" ldr x3, %[rs_b] \n\t"  // Row-skip of B.
+" \n\t"
+" ldr x6, %[rs_c] \n\t"  // Row-skip of C.
+" ldr x7, %[cs_c] \n\t"  // Column-skip of C.
+" \n\t"
+" \n\t"  // Multiply some address skips by sizeof(double).
+" lsl x11, x11, #3 \n\t"  // ps_a
+" lsl x9, x9, #3 \n\t"  // rs_a
+" lsl x2, x2, #3 \n\t"  // cs_a
+" lsl x3, x3, #3 \n\t"  // rs_b
+" lsl x6, x6, #3 \n\t"  // rs_c
+" lsl x7, x7, #3 \n\t"  // cs_c
+" \n\t"
+" mov x1, x13 \n\t"  // Prefetch from C's base; x5 is not set until MILLIKER_MLOOP.
+" cmp x7, #8 \n\t"  // Prefetch column-strided C.
+BNE(C_PREFETCH_COLS)
+// This prefetch will not cover further mker parts. Skip.
+//
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+// DPRFMC_FWD(x1,x6)
+BRANCH(C_PREFETCH_END)
+LABEL(C_PREFETCH_COLS)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+DPRFMC_FWD(x1,x7)
+LABEL(C_PREFETCH_END)
+//
+// Millikernel.
+LABEL(MILLIKER_MLOOP)
+" \n\t"
+" mov x0, x10 \n\t"  // Parameters to be reloaded
+" mov x5, x13 \n\t"  // within each millikernel loop.
+" ldr x1, %[b] \n\t"
+" ldr x4, %[k_mker] \n\t"
+" ldr x8, %[k_left] \n\t"
+" \n\t"
+// Storage scheme:
+//  V[ 0:14] <- C
+//  V[15:23] <- A
+//  V[24:29] <- B
+// Under this scheme, the following is defined:
+#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
+  DGEMM_C6X5_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,A0,A1,A2,B0,B1,B2,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT)
+// Load from memory.
+LABEL(LOAD_ABC)
+" \n\t"  // No-microkernel early return is a must
+" cmp x4, #0 \n\t"  // to avoid out-of-boundary read.
+BEQ(CLEAR_CCOLS)
+" \n\t"
+" mov x14, x0 \n\t"  // Load A.
+" ld1 {v15.d}[0], [x14], x9 \n\t"
+" ld1 {v15.d}[1], [x14], x9 \n\t"
+" ld1 {v16.d}[0], [x14], x9 \n\t"
+" ld1 {v16.d}[1], [x14], x9 \n\t"
+" ld1 {v17.d}[0], [x14], x9 \n\t"
+" ld1 {v17.d}[1], [x14], x9 \n\t"
+" add x0, x0, x2 \n\t"
+" mov x14, x0 \n\t"
+" ld1 {v18.d}[0], [x14], x9 \n\t"
+" ld1 {v18.d}[1], [x14], x9 \n\t"
+" ld1 {v19.d}[0], [x14], x9 \n\t"
+" ld1 {v19.d}[1], [x14], x9 \n\t"
+" ld1 {v20.d}[0], [x14], x9 \n\t"
+" ld1 {v20.d}[1], [x14], x9 \n\t"
+" add x0, x0, x2 \n\t"
+" mov x14, x0 \n\t"
+" ld1 {v21.d}[0], [x14], x9 \n\t"
+" ld1 {v21.d}[1], [x14], x9 \n\t"
+" ld1 {v22.d}[0], [x14], x9 \n\t"
+" ld1 {v22.d}[1], [x14], x9 \n\t"
+" ld1 {v23.d}[0], [x14], x9 \n\t"
+" ld1 {v23.d}[1], [x14], x9 \n\t"
+" add x0, x0, x2 \n\t"
+" mov x14, x0 \n\t"
+" \n\t"
+" ldr q24, [x1, #16*0] \n\t"  // Load B.
+" ldr q25, [x1, #16*1] \n\t"
+" ldr d26, [x1, #16*2] \n\t"  // Scalar loads into idx 0.
+" add x1, x1, x3 \n\t"
+" ldr q27, [x1, #16*0] \n\t"
+" ldr q28, [x1, #16*1] \n\t"
+" ldr d29, [x1, #16*2] \n\t"
+" add x1, x1, x3 \n\t"
+LABEL(CLEAR_CCOLS)
+CLEAR4V(0,1,2,3)
+CLEAR1V(4)
+CLEAR4V(5,6,7,8)
+CLEAR1V(9)
+CLEAR4V(10,11,12,13)
+CLEAR1V(14)
+// No-microkernel early return, once again.
+BEQ(K_LEFT_LOOP)
+//
+// Microkernel is defined here as:
+#define DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,BIDX) \
+  DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,BIDX,x14,x9,x1,0,load) \
+  "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \
+  "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \
+  "add x0, x0, x2 \n\t" \
+  "mov x14, x0 \n\t" \
+  /* Due to this loading, BIDX can only be 0 here. */ \
+  "ldr d"#B2", [x1, #16*2] \n\t" \
+  "add x1, x1, x3 \n\t"
+// Start microkernel loop.
+LABEL(K_MKER_LOOP)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,24,25,26,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,27,28,29,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,24,25,26,0)
+" \n\t"  // Decrease counter before final replica.
+" subs x4, x4, #1 \n\t"  // Branch early to avoid reading excess mem.
+BEQ(FIN_MKER_LOOP)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(15,16,17,27,28,29,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26,0)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29,0)
+BRANCH(K_MKER_LOOP)
+//
+// Final microkernel loop.
+LABEL(FIN_MKER_LOOP)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,27,28,29,0,xzr,-1,xzr,-1,noload)
+" ldr q27, [x1, #16*0] \n\t"
+" ldr q28, [x1, #16*1] \n\t"
+" ldr d29, [x1, #16*2] \n\t"
+" add x1, x1, x3 \n\t"
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,0,xzr,-1,xzr,-1,noload)
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(21,22,23,27,28,29,0,xzr,-1,xzr,-1,noload)
+//
+// Loops left behind microkernels.
+LABEL(K_LEFT_LOOP)
+" cmp x8, #0 \n\t"  // End of exec.
+BEQ(WRITE_MEM_PREP)
+" mov x14, x0 \n\t"  // Load A col.
+" ld1 {v15.d}[0], [x14], x9 \n\t"
+" ld1 {v15.d}[1], [x14], x9 \n\t"
+" ld1 {v16.d}[0], [x14], x9 \n\t"
+" ld1 {v16.d}[1], [x14], x9 \n\t"
+" ld1 {v17.d}[0], [x14], x9 \n\t"
+" ld1 {v17.d}[1], [x14], x9 \n\t"
+" add x0, x0, x2 \n\t"
+" ldr q24, [x1, #16*0] \n\t"  // Load B row.
+" ldr q25, [x1, #16*1] \n\t"
+" ldr d26, [x1, #16*2] \n\t"
+" add x1, x1, x3 \n\t"
+" sub x8, x8, #1 \n\t"
+DGEMM_C6X5_MKER_LOOP_PLAIN_LOC(15,16,17,24,25,26,0,xzr,-1,xzr,-1,noload)
+BRANCH(K_LEFT_LOOP)
+//
+// Scale and write to memory.
+LABEL(WRITE_MEM_PREP)
+" ldr x4, %[alpha] \n\t"  // Load alpha & beta (address).
+" ldr x8, %[beta] \n\t"
+" ld1r {v30.2d}, [x4] \n\t"  // Load alpha & beta.
+" ld1r {v31.2d}, [x8] \n\t"
+" fmov d26, #1.0 \n\t"
+" fcmp d30, d26 \n\t"  // Skip the alpha scaling when alpha == 1.0.
+BEQ(UNIT_ALPHA)
+DSCALE5V(0,1,2,3,4,30,0)
+DSCALE5V(5,6,7,8,9,30,0)
+DSCALE5V(10,11,12,13,14,30,0)
+LABEL(UNIT_ALPHA)
+" \n\t"
+" mov x1, x5 \n\t"  // C address for loading.
+" \n\t"  // C address for storing is x5 itself.
+" cmp x7, #8 \n\t"  // Check for column-storage.
+BNE(WRITE_MEM_C)
+// Unlike other RV kernels, here row-storage of C requires
+// in-register transpose.
+" trn1 v15.2d, v0.2d, v1.2d \n\t"
+" trn2 v16.2d, v0.2d, v1.2d \n\t"
+" trn1 v17.2d, v2.2d, v3.2d \n\t"
+" trn2 v18.2d, v2.2d, v3.2d \n\t"
+" \n\t"
+" trn1 v19.2d, v5.2d, v6.2d \n\t"
+" trn2 v20.2d, v5.2d, v6.2d \n\t"
+" trn1 v21.2d, v7.2d, v8.2d \n\t"
+" trn2 v22.2d, v7.2d, v8.2d \n\t"
+" \n\t"
+" trn1 v23.2d, v10.2d, v11.2d \n\t"
+" trn2 v24.2d, v10.2d, v11.2d \n\t"
+" trn1 v25.2d, v12.2d, v13.2d \n\t"
+" trn2 v26.2d, v12.2d, v13.2d \n\t"
+" \n\t"
+" fcmp d31, #0.0 \n\t"
+BEQ(ZERO_BETA_R)
+DLOADC_2PHV_R_FWD(0,1,28,0,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(2,3,28,1,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(5,6,29,0,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(7,8,29,1,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(10,11,30,0,x1,0,x6,x8)
+DLOADC_2PHV_R_FWD(12,13,30,1,x1,0,x6,x8)
+DSCALEA5V(15,17,16,18,4,0,1,2,3,28,31,0)
+DSCALEA5V(19,21,20,22,9,5,6,7,8,29,31,0)
+DSCALEA5V(23,25,24,26,14,10,11,12,13,30,31,0)
+LABEL(ZERO_BETA_R)
+#ifndef __clang__
+" cmp x12, #1 \n\t"
+BRANCH(PRFM_END_R)  // NOTE(review): if BRANCH is unconditional, the cmp is unused and the prfm lines below never run; confirm intent.
+" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_R)
+#endif
+DSTOREC_2PHV_R_FWD(15,17,4,0,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(16,18,4,1,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(19,21,9,0,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(20,22,9,1,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(23,25,14,0,x5,0,x6,x8)
+DSTOREC_2PHV_R_FWD(24,26,14,1,x5,0,x6,x8)
+BRANCH(END_WRITE_MEM)
+//
+// C storage in columns.
+LABEL(WRITE_MEM_C)
+" fcmp d31, #0.0 \n\t"
+BEQ(ZERO_BETA_C)
+DLOADC_3V_C_FWD(15,20,25,x1,0,x7)
+DLOADC_3V_C_FWD(16,21,26,x1,0,x7)
+DLOADC_3V_C_FWD(17,22,27,x1,0,x7)
+DLOADC_3V_C_FWD(18,23,28,x1,0,x7)
+DLOADC_3V_C_FWD(19,24,29,x1,0,x7)
+DSCALEA5V(0,1,2,3,4,15,16,17,18,19,31,0)
+DSCALEA5V(5,6,7,8,9,20,21,22,23,24,31,0)
+DSCALEA5V(10,11,12,13,14,25,26,27,28,29,31,0)
+LABEL(ZERO_BETA_C)
+#ifndef __clang__
+" cmp x12, #1 \n\t"
+BRANCH(PRFM_END_C)  // NOTE(review): if BRANCH is unconditional, the cmp is unused and the prfm lines below never run; confirm intent.
+" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t"
+" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t"
+" prfm PLDL1STRM, [%[b_next], #16*0] \n\t"
+" prfm PLDL1STRM, [%[b_next], #16*1] \n\t"
+LABEL(PRFM_END_C)
+#endif
+DSTOREC_3V_C_FWD(0,5,10,x5,0,x7)
+DSTOREC_3V_C_FWD(1,6,11,x5,0,x7)
+DSTOREC_3V_C_FWD(2,7,12,x5,0,x7)
+DSTOREC_3V_C_FWD(3,8,13,x5,0,x7)
+DSTOREC_3V_C_FWD(4,9,14,x5,0,x7)
+//
+// End of this microkernel.
+LABEL(END_WRITE_MEM)
+" \n\t"
+" subs x12, x12, #1 \n\t"
+BEQ(END_EXEC)
+" \n\t"
+" mov x8, #6 \n\t"
+" madd x13, x6, x8, x13 \n\t"  // Forward C's base address to the next logic panel.
+" add x10, x10, x11 \n\t"  // Forward A's base address to the next logic panel.
+BRANCH(MILLIKER_MLOOP)
+//
+// End of execution.
+LABEL(END_EXEC)
+:
+: [a] "m" (a),
+  [b] "m" (b),
+  [c] "m" (c),
+  [rs_a] "m" (rs_a),
+  [cs_a] "m" (cs_a),
+  [ps_a] "m" (ps_a),
+  [rs_b] "m" (rs_b),
+  [rs_c] "m" (rs_c),
+  [cs_c] "m" (cs_c),
+  // In Clang, even "m"-passed parameter takes 1 register.
+  // Have to disable prefetching to pass compilation.
+#ifndef __clang__
+  [a_next] "r" (a_next),
+  [b_next] "r" (b_next),
+#endif
+  [m_iter] "m" (m_iter),
+  [k_mker] "m" (k_mker),
+  [k_left] "m" (k_left),
+  [alpha] "m" (alpha),
+  [beta] "m" (beta)
+: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+  "x8", "x9", "x10","x11","x12","x13","x14",
+  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10","v11","v12","v13","v14","v15",
+  "v16","v17","v18","v19","v20","v21","v22","v23",
+  "v24","v25","v26","v27","v28","v29","v30","v31",  "cc", "memory"  // Flags are set and A/B/C are accessed through loaded pointers.
+  );
+
+consider_edge_cases:
+  if ( m_left == 0 ) return;  // No edge rows remain: skip the edge kernel.
+  a = a + m_iter * ps_a;      // Forward address.
+  c = c + m_iter * 6 * rs_c;
+  auxinfo_t data_d6x4mn = *data;
+  bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn );
+  bli_dgemmsup_rv_armv8a_int_6x4mn
+  (
+    conja, conjb, m_left, 5, k0,
+    alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
+    beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx
+  );
+
+}
+
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
new file mode 100644
index 000000000..4273030dd
--- /dev/null
+++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x6m.c
@@ -0,0 +1,477 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, The University of Tokyo
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" +#include "assert.h" + + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. 
+#include "../armv8a_asm_d2x2.h" + +/* Order of row-major DGEMM_6x6's execution in 2x2 blocks: + * + * +---+ +---+ +---+ + * | 0 | | 1 | | 2 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 3 | | 4 | | 5 | + * +---+ +---+ +---+ + * +---+ +---+ +---+ + * | 6 | | 7 | | 8 | + * +---+ +---+ +---+ + * + */ +#define DGEMM_6X6_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \ + DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \ + DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \ + DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \ + DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \ + DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \ + DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \ + DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) +#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ +" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ +" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" + +// Prefetch C in the long direction. +#define DPRFMC_FWD(CADDR,DLONGC) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +// For row-storage of C. +#define DLOADC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) +#define DSTOREC_3V_R_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) \ + DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,RSC) + +// For column-storage of C. 
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" +#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" + +#define DSCALE6V(V0,V1,V2,V3,V4,V5,A,IDX) \ + DSCALE4V(V0,V1,V2,V3,A,IDX) \ + DSCALE2V(V4,V5,A,IDX) +#define DSCALEA6V(D0,D1,D2,D3,D4,D5,S0,S1,S2,S3,S4,S5,A,IDX) \ + DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + DSCALEA2V(D4,D5,S4,S5,A,IDX) + + +void bli_dgemmsup_rv_armv8a_asm_6x6m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( n0 == 6 ); + + // LLVM has very bad routing ability for inline asm. + // Limit number of registers in case of Clang compilation. +#ifndef __clang__ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); +#endif + uint64_t ps_a = bli_auxinfo_ps_a( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 8; + uint64_t k_left = k0 % 8; + + int64_t m_iter = m0 / 6; + int64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[a] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[m_iter] \n\t" +" ldr x11, %[ps_a] \n\t" // Panel-skip of A. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. +" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. 
+" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_a +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +" mov x1, x5 \n\t" +" cmp x7, #8 \n\t" // Prefetch column-strided C. +BEQ(C_PREFETCH_COLS) +// This prefetch will not cover further mker perts. Skip. +// +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +BRANCH(C_PREFETCH_END) +LABEL(C_PREFETCH_COLS) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +LABEL(C_PREFETCH_END) +// +// Millikernel. +LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x0, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop. +" ldr x1, %[b] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:17] <- C +// V[18:23] <- A +// V[24:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_6X6_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,A0,A1,A2,B0,B1,B2,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" mov x14, x0 \n\t" // Load A. 
+" ld1 {v18.d}[0], [x14], x9 \n\t" +" ld1 {v18.d}[1], [x14], x9 \n\t" +" ld1 {v19.d}[0], [x14], x9 \n\t" +" ld1 {v19.d}[1], [x14], x9 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" +" ld1 {v20.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" \n\t" +" ldr q24, [x1, #16*0] \n\t" // Load B. +" ldr q25, [x1, #16*1] \n\t" +" ldr q26, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" ldr q27, [x1, #16*0] \n\t" +" ldr q28, [x1, #16*1] \n\t" +" ldr q29, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" ldr q30, [x1, #16*0] \n\t" +" ldr q31, [x1, #16*1] \n\t" +LABEL(CLEAR_CCOLS) +CLEAR4V(0,1,2,3) +CLEAR2V(4,5) +CLEAR4V(6,7,8,9) +CLEAR2V(10,11) +CLEAR4V(12,13,14,15) +CLEAR2V(16,17) +// No-microkernel early return, once again. +BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2) \ + DGEMM_6X6_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,x14,x9,x1,16*2,load) \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ + "add x0, x0, x2 \n\t" \ + "mov x14, x0 \n\t" \ + "add x1, x1, x3 \n\t" \ + "ldr q"#B1", [x1, #16*0] \n\t" \ + "ldr q"#B2", [x1, #16*1] \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,24,25,26) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,27,28,29) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,30,31,24) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,25,26,27) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,28,29,30) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. 
+BEQ(FIN_MKER_LOOP) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,31,24,25) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(18,19,20,26,27,28) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,29,30,31) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,31,24,25,x14,x9,x1,16*2,load) +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" add x1, x1, x3 \n\t" +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,26,27,28,xzr,-1,xzr,-1,noload) +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(21,22,23,29,30,31,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" mov x14, x0 \n\t" +" ld1 {v18.d}[0], [x14], x9 \n\t" // Load A col. +" ld1 {v18.d}[1], [x14], x9 \n\t" +" ld1 {v19.d}[0], [x14], x9 \n\t" +" ld1 {v19.d}[1], [x14], x9 \n\t" +" ld1 {v20.d}[0], [x14], x9 \n\t" +" ld1 {v20.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" ldr q24, [x1, #16*0] \n\t" // Load B row. +" ldr q25, [x1, #16*1] \n\t" +" ldr q26, [x1, #16*2] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_6X6_MKER_LOOP_PLAIN_LOC(18,19,20,24,25,26,xzr,-1,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta. +" ld1r {v31.2d}, [x8] \n\t" +" fmov d26, #1.0 \n\t" +" fcmp d30, d26 \n\t" +BEQ(UNIT_ALPHA) +DSCALE6V(0,1,2,3,4,5,30,0) +DSCALE6V(6,7,8,9,10,11,30,0) +DSCALE6V(12,13,14,15,16,17,30,0) +LABEL(UNIT_ALPHA) +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x7, #8 \n\t" // Check for column-storage. +BNE(WRITE_MEM_C) +// +// C storage in rows. 
+" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_R_1_2) +DLOADC_3V_R_FWD(18,19,20,x1,0,x6) +DLOADC_3V_R_FWD(21,22,23,x1,0,x6) +DSCALEA6V(0,1,2,3,4,5,18,19,20,21,22,23,31,0) +LABEL(ZERO_BETA_R_1_2) +DSTOREC_3V_R_FWD(0,1,2,x5,0,x6) +DSTOREC_3V_R_FWD(3,4,5,x5,0,x6) +BEQ(ZERO_BETA_R_3_4_5_6) +DLOADC_3V_R_FWD(18,19,20,x1,0,x6) +DLOADC_3V_R_FWD(21,22,23,x1,0,x6) +DLOADC_3V_R_FWD(0,1,2,x1,0,x6) +DLOADC_3V_R_FWD(3,4,5,x1,0,x6) +DSCALEA6V(6,7,8,9,10,11,18,19,20,21,22,23,31,0) +DSCALEA6V(12,13,14,15,16,17,0,1,2,3,4,5,31,0) +LABEL(ZERO_BETA_R_3_4_5_6) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_R) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_R) +#endif +DSTOREC_3V_R_FWD(6,7,8,x5,0,x6) +DSTOREC_3V_R_FWD(9,10,11,x5,0,x6) +DSTOREC_3V_R_FWD(12,13,14,x5,0,x6) +DSTOREC_3V_R_FWD(15,16,17,x5,0,x6) +BRANCH(END_WRITE_MEM) +// +// C storage in columns. +LABEL(WRITE_MEM_C) +// In-register transpose, +// do transposition in row-order. +" trn1 v18.2d, v0.2d, v3.2d \n\t" // Row 0-1. +" trn2 v19.2d, v0.2d, v3.2d \n\t" +" trn1 v20.2d, v1.2d, v4.2d \n\t" +" trn2 v21.2d, v1.2d, v4.2d \n\t" +" trn1 v22.2d, v2.2d, v5.2d \n\t" +" trn2 v23.2d, v2.2d, v5.2d \n\t" +" \n\t" +" trn1 v24.2d, v6.2d, v9.2d \n\t" // Row 2-3. +" trn2 v25.2d, v6.2d, v9.2d \n\t" +" trn1 v26.2d, v7.2d, v10.2d \n\t" +" trn2 v27.2d, v7.2d, v10.2d \n\t" +" trn1 v28.2d, v8.2d, v11.2d \n\t" +" trn2 v29.2d, v8.2d, v11.2d \n\t" +" \n\t" +" trn1 v0.2d, v12.2d, v15.2d \n\t" // Row 4-5. 
+" trn2 v1.2d, v12.2d, v15.2d \n\t" +" trn1 v2.2d, v13.2d, v16.2d \n\t" +" trn2 v3.2d, v13.2d, v16.2d \n\t" +" trn1 v4.2d, v14.2d, v17.2d \n\t" +" trn2 v5.2d, v14.2d, v17.2d \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_C_1_2) +DLOADC_3V_C_FWD(6,7,8,x1,0,x7) +DLOADC_3V_C_FWD(9,10,11,x1,0,x7) +DSCALEA6V(18,24,0,19,25,1,6,7,8,9,10,11,31,0) +LABEL(ZERO_BETA_C_1_2) +DSTOREC_3V_C_FWD(18,24,0,x5,0,x7) +DSTOREC_3V_C_FWD(19,25,1,x5,0,x7) +BEQ(ZERO_BETA_C_3_4_5_6) +DLOADC_3V_C_FWD(6,7,8,x1,0,x7) +DLOADC_3V_C_FWD(9,10,11,x1,0,x7) +DLOADC_3V_C_FWD(12,13,14,x1,0,x7) +DLOADC_3V_C_FWD(15,16,17,x1,0,x7) +DSCALEA6V(20,26,2,21,27,3,6,7,8,9,10,11,31,0) +DSCALEA6V(22,28,4,23,29,5,12,13,14,15,16,17,31,0) +LABEL(ZERO_BETA_C_3_4_5_6) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_C) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_C) +#endif +DSTOREC_3V_C_FWD(20,26,2,x5,0,x7) +DSTOREC_3V_C_FWD(21,27,3,x5,0,x7) +DSTOREC_3V_C_FWD(22,28,4,x5,0,x7) +DSTOREC_3V_C_FWD(23,29,5,x5,0,x7) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #6 \n\t" +" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. +LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a] "m" (ps_a), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + // In Clang, even "m"-passed parameter takes 1 register. + // Have to disable prefetching to pass compilation. 
+#ifndef __clang__ + [a_next] "r" (a_next), + [b_next] "r" (b_next), +#endif + [m_iter] "m" (m_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + +consider_edge_cases: + // Forward address. + a = a + m_iter * ps_a; + c = c + m_iter * 6 * rs_c; + auxinfo_t data_d6x4mn = *data; + bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); + bli_dgemmsup_rv_armv8a_int_6x4mn + ( + conja, conjb, m_left, 6, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx + ); + +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c new file mode 100644 index 000000000..afdd13e28 --- /dev/null +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x7m.c @@ -0,0 +1,513 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" +#include "assert.h" + + +// Label locality & misc. +#include "../armv8a_asm_utils.h" + +// Nanokernel operations. +#include "../armv8a_asm_d2x2.h" + +/* Odd-NR dgemmsup_rv_*m kernels are special in that + * despite the row-major name, C is laid out in COLUMNS in the register space.
+ * + * Block order: + * + * +---+ +---+ +---+ + * | 0 | | 3 | | 6 | |9 + * +---+ +---+ +---+ | + * +---+ +---+ +---+ + * | 1 | | 4 | | 7 | |10 + * +---+ +---+ +---+ | + * +---+ +---+ +---+ + * | 2 | | 5 | | 8 | |11 + * +---+ +---+ +---+ | + * + */ +#define DGEMM_C6X7_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C05,C06,C10,C11,C12,C13,C14,C15,C16,C20,C21,C22,C23,C24,C25,C26,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \ + DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \ + DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \ + DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \ + DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \ + DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \ + DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \ + DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT+16) \ + DGEMM_2X2_NANOKERNEL(C04,C05,A0,B2) \ + DGEMM_2X2_NANOKERNEL(C14,C15,A1,B2) \ + DGEMM_2X2_NANOKERNEL(C24,C25,A2,B2) \ + DGEMM_LOAD1V_ ##LOADNEXT (B2,BADDR,BSHIFT+32) \ +" fmla v"#C06".2d, v"#A0".2d, v"#B3".d["#BIDX"] \n\t" \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \ +" fmla v"#C16".2d, v"#A1".2d, v"#B3".d["#BIDX"] \n\t" \ + DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \ +" fmla v"#C26".2d, v"#A2".2d, v"#B3".d["#BIDX"] \n\t" + +// Interleaving load or not. +#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) +#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" + +// #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) +// #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ +// DGEMM_LOAD1V_load(V1,ADDR,IMM) \ +// DGEMM_LOAD1V_load(V2,ADDR,IMM+16) + +#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) +#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ +" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ +" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" + +// Prefetch C in the long direction. +#define DPRFMC_FWD(CADDR,DLONGC) \ +" prfm PLDL1KEEP, ["#CADDR"] \n\t" \ +" add "#CADDR", "#CADDR", "#DLONGC" \n\t" + +// For column-storage of C. 
+#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" +#define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" add "#CADDR", "#CADDR", "#CSC" \n\t" + +// For row-storage of C: Store 3+1/2 vectors. +#define DLOADC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \ + DLOAD2V(C0,C1,CADDR,CSHIFT) \ + DLOAD1V(C2,CADDR,CSHIFT+32) \ +" ld1 {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" +#define DSTOREC_3PHV_R_FWD(C0,C1,C2,C3,CIDX,CADDR,CSHIFT,RSC,CTMP) \ +" add "#CTMP", "#CADDR", "#CSHIFT"+48 \n\t" \ + DSTORE2V(C0,C1,CADDR,CSHIFT) \ + DSTORE1V(C2,CADDR,CSHIFT+32) \ +" st1 {v"#C3".d}["#CIDX"], ["#CTMP"] \n\t" \ +" add "#CADDR", "#CADDR", "#RSC" \n\t" + +#define DSCALE7V(V0,V1,V2,V3,V4,V5,V6,A,IDX) \ + DSCALE4V(V0,V1,V2,V3,A,IDX) \ + DSCALE2V(V4,V5,A,IDX) \ + DSCALE1V(V6,A,IDX) +#define DSCALEA7V(D0,D1,D2,D3,D4,D5,D6,S0,S1,S2,S3,S4,S5,S6,A,IDX) \ + DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ + DSCALEA2V(D4,D5,S4,S5,A,IDX) \ + DSCALEA1V(D6,S6,A,IDX) +#define DSCALEA3V(D0,D1,D2,S0,S1,S2,A,IDX) \ + DSCALEA2V(D0,D1,S0,S1,A,IDX) \ + DSCALEA1V(D2,S2,A,IDX) + + +void bli_dgemmsup_rv_armv8a_asm_6x7m + ( + conj_t conja, + conj_t conjb, + dim_t m0, + dim_t n0, + dim_t k0, + double* restrict alpha, + double* restrict a, inc_t rs_a0, inc_t cs_a0, + double* restrict b, inc_t rs_b0, inc_t cs_b0, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* data, + cntx_t* cntx + ) +{ + assert( n0 == 7 ); + + // LLVM has very bad routing ability for inline asm. + // Limit number of registers in case of Clang compilation. 
+#ifndef __clang__ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); +#endif + uint64_t ps_a = bli_auxinfo_ps_a( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 7; + uint64_t k_left = k0 % 7; + + int64_t m_iter = m0 / 6; + int64_t m_left = m0 % 6; + + uint64_t rs_a = rs_a0; + uint64_t cs_a = cs_a0; + uint64_t rs_b = rs_b0; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + // uint64_t cs_b = cs_b0; + assert( cs_b0 == 1 ); + + if ( m_iter == 0 ) goto consider_edge_cases; + + __asm__ volatile + ( +" ldr x10, %[a] \n\t" +" ldr x13, %[c] \n\t" +" ldr x12, %[m_iter] \n\t" +" ldr x11, %[ps_a] \n\t" // Panel-skip of A. +" ldr x9, %[rs_a] \n\t" // Row-skip of A. +" ldr x2, %[cs_a] \n\t" // Column-skip of A. +" ldr x3, %[rs_b] \n\t" // Row-skip of B. +" \n\t" +" ldr x6, %[rs_c] \n\t" // Row-skip of C. +" ldr x7, %[cs_c] \n\t" // Column-skip of C. +" \n\t" +" \n\t" // Multiply some address skips by sizeof(double). +" lsl x11, x11, #3 \n\t" // ps_a +" lsl x9, x9, #3 \n\t" // rs_a +" lsl x2, x2, #3 \n\t" // cs_a +" lsl x3, x3, #3 \n\t" // rs_b +" lsl x6, x6, #3 \n\t" // rs_c +" lsl x7, x7, #3 \n\t" // cs_c +" \n\t" +" mov x1, x13 \n\t" // Prefetch from C's base (x13); x5 is first set inside MILLIKER_MLOOP. +" cmp x7, #8 \n\t" // Prefetch column-strided C. +BNE(C_PREFETCH_COLS) +// This prefetch will not cover further mker parts. Skip. +// +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +// DPRFMC_FWD(x1,x6) +BRANCH(C_PREFETCH_END) +LABEL(C_PREFETCH_COLS) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +DPRFMC_FWD(x1,x7) +LABEL(C_PREFETCH_END) +// +// Millikernel. +LABEL(MILLIKER_MLOOP) +" \n\t" +" mov x0, x10 \n\t" // Parameters to be reloaded +" mov x5, x13 \n\t" // within each millikernel loop.
+" ldr x1, %[b] \n\t" +" ldr x4, %[k_mker] \n\t" +" ldr x8, %[k_left] \n\t" +" \n\t" +// Storage scheme: +// V[ 0:20] <- C +// V[21:27] <- A +// V[28:31] <- B +// Under this scheme, the following is defined: +#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ + DGEMM_C6X7_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,A0,A1,A2,B0,B1,B2,B3,BIDX,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) +// Load from memory. +LABEL(LOAD_ABC) +" \n\t" // No-microkernel early return is a must +" cmp x4, #0 \n\t" // to avoid out-of-boundary read. +BEQ(CLEAR_CCOLS) +" \n\t" +" mov x14, x0 \n\t" // Load A. +" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v24.d}[0], [x14], x9 \n\t" +" ld1 {v24.d}[1], [x14], x9 \n\t" +" ld1 {v25.d}[0], [x14], x9 \n\t" +" ld1 {v25.d}[1], [x14], x9 \n\t" +" ld1 {v26.d}[0], [x14], x9 \n\t" +" ld1 {v26.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ld1 {v27.d}[0], [x14], x9 \n\t" +" ld1 {v27.d}[1], [x14], x9 \n\t" +" \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" ldr d31, [x1, #16*3] \n\t" // Scalar loads into idx 0. +" add x1, x1, x3 \n\t" +" \n\t" +LABEL(CLEAR_CCOLS) +CLEAR4V(0,1,2,3) +CLEAR2V(4,5) +CLEAR1V(6) +CLEAR4V(7,8,9,10) +CLEAR2V(11,12) +CLEAR1V(13) +CLEAR4V(14,15,16,17) +CLEAR2V(18,19) +CLEAR1V(20) +// No-microkernel early return, once again. 
+BEQ(K_LEFT_LOOP) +// +// Microkernel is defined here as: +#define DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3,BIDX) \ + DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BIDX,x14,x9,x1,0,load) \ + "add x0, x0, x2 \n\t" \ + "mov x14, x0 \n\t" \ + "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ + "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ + /* Due to this loading, BIDX can only be 0 here. */ \ + "ldr d"#B3", [x1, #16*3] \n\t" \ + "add x1, x1, x3 \n\t" +// Start microkernel loop. +LABEL(K_MKER_LOOP) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(21,22,23,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(27,21,22,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(23,24,25,28,29,30,31,0) +" \n\t" // Decrease counter before final replica. +" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(26,27,21,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(22,23,24,28,29,30,31,0) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,28,29,30,31,0) +BRANCH(K_MKER_LOOP) +// +// Final microkernel loop. +LABEL(FIN_MKER_LOOP) +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(26,27,21,28,29,30,31,0,x14,x9,x1,0,load) +" add x0, x0, x2 \n\t" +" mov x14, x0 \n\t" +" ldr d31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(22,23,24,28,29,30,31,0,xzr,-1,xzr,-1,noload) +" ldr q28, [x1, #16*0] \n\t" +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" ldr d31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(25,26,27,28,29,30,31,0,xzr,-1,xzr,-1,noload) +// +// Loops left behind microkernels. +LABEL(K_LEFT_LOOP) +" cmp x8, #0 \n\t" // End of exec. +BEQ(WRITE_MEM_PREP) +" mov x14, x0 \n\t" // Load A col. 
+" ld1 {v21.d}[0], [x14], x9 \n\t" +" ld1 {v21.d}[1], [x14], x9 \n\t" +" ld1 {v22.d}[0], [x14], x9 \n\t" +" ld1 {v22.d}[1], [x14], x9 \n\t" +" ld1 {v23.d}[0], [x14], x9 \n\t" +" ld1 {v23.d}[1], [x14], x9 \n\t" +" add x0, x0, x2 \n\t" +" ldr q28, [x1, #16*0] \n\t" // Load B row. +" ldr q29, [x1, #16*1] \n\t" +" ldr q30, [x1, #16*2] \n\t" +" ldr d31, [x1, #16*3] \n\t" +" add x1, x1, x3 \n\t" +" sub x8, x8, #1 \n\t" +DGEMM_C6X7_MKER_LOOP_PLAIN_LOC(21,22,23,28,29,30,31,0,xzr,-1,xzr,-1,noload) +BRANCH(K_LEFT_LOOP) +// +// Scale and write to memory. +LABEL(WRITE_MEM_PREP) +" ldr x4, %[alpha] \n\t" // Load alpha & beta (address). +" ldr x8, %[beta] \n\t" +" ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta. +" ld1r {v31.2d}, [x8] \n\t" +" fmov d26, #1.0 \n\t" +" fcmp d30, d26 \n\t" +BEQ(UNIT_ALPHA) +DSCALE7V(0,1,2,3,4,5,6,30,0) +DSCALE7V(7,8,9,10,11,12,13,30,0) +DSCALE7V(14,15,16,17,18,19,20,30,0) +LABEL(UNIT_ALPHA) +" \n\t" +" mov x1, x5 \n\t" // C address for loading. +" \n\t" // C address for storing is x5 itself. +" cmp x7, #8 \n\t" // Check for column-storage. +BNE(WRITE_MEM_C) +// Unlike other RV kernels, here row-storage of C requires +// in-register transpose. 
+" trn1 v21.2d, v0.2d, v1.2d \n\t" +" trn2 v22.2d, v0.2d, v1.2d \n\t" +" trn1 v23.2d, v2.2d, v3.2d \n\t" +" trn2 v24.2d, v2.2d, v3.2d \n\t" +" trn1 v25.2d, v4.2d, v5.2d \n\t" +" trn2 v26.2d, v4.2d, v5.2d \n\t" +" \n\t" +" trn1 v0.2d, v7.2d, v8.2d \n\t" +" trn2 v1.2d, v7.2d, v8.2d \n\t" +" trn1 v2.2d, v9.2d, v10.2d \n\t" +" trn2 v3.2d, v9.2d, v10.2d \n\t" +" trn1 v4.2d, v11.2d, v12.2d \n\t" +" trn2 v5.2d, v11.2d, v12.2d \n\t" +" \n\t" +" trn1 v7.2d, v14.2d, v15.2d \n\t" +" trn2 v8.2d, v14.2d, v15.2d \n\t" +" trn1 v9.2d, v16.2d, v17.2d \n\t" +" trn2 v10.2d, v16.2d, v17.2d \n\t" +" trn1 v11.2d, v18.2d, v19.2d \n\t" +" trn2 v12.2d, v18.2d, v19.2d \n\t" +" \n\t" +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_R_1_2) +DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8) +DSCALEA7V(21,23,25,22,24,26,6,14,15,16,17,18,19,30,31,0) +LABEL(ZERO_BETA_R_1_2) +DSTOREC_3PHV_R_FWD(21,23,25,6,0,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(22,24,26,6,1,x5,0,x6,x8) +BEQ(ZERO_BETA_R_3_4_5_6) +DLOADC_3PHV_R_FWD(14,15,16,30,0,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(17,18,19,30,1,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(21,22,23,28,0,x1,0,x6,x8) +DLOADC_3PHV_R_FWD(24,25,26,28,1,x1,0,x6,x8) +DSCALEA7V(0,2,4,1,3,5,13,14,15,16,17,18,19,30,31,0) +DSCALEA7V(7,9,11,8,10,12,20,21,22,23,24,25,26,28,31,0) +LABEL(ZERO_BETA_R_3_4_5_6) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_R) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_R) +#endif +DSTOREC_3PHV_R_FWD(0,2,4,13,0,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(1,3,5,13,1,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(7,9,11,20,0,x5,0,x6,x8) +DSTOREC_3PHV_R_FWD(8,10,12,20,1,x5,0,x6,x8) +BRANCH(END_WRITE_MEM) +// +// C storage in columns. 
+LABEL(WRITE_MEM_C) +" fcmp d31, #0.0 \n\t" +BEQ(ZERO_BETA_C_1_2) +DLOADC_3V_C_FWD(21,22,23,x1,0,x7) +DLOADC_3V_C_FWD(24,25,26,x1,0,x7) +DSCALEA3V(0,7,14,21,22,23,31,0) +DSCALEA3V(1,8,15,24,25,26,31,0) +LABEL(ZERO_BETA_C_1_2) +DSTOREC_3V_C_FWD(0,7,14,x5,0,x7) +DSTOREC_3V_C_FWD(1,8,15,x5,0,x7) +BEQ(ZERO_BETA_C_3_4_5_6_7) +DLOADC_3V_C_FWD(21,22,23,x1,0,x7) +DLOADC_3V_C_FWD(24,25,26,x1,0,x7) +DLOADC_3V_C_FWD(27,28,29,x1,0,x7) +DLOADC_3V_C_FWD(0,7,14,x1,0,x7) +DLOADC_3V_C_FWD(1,8,15,x1,0,x7) +DSCALEA3V(2,9,16,21,22,23,31,0) +DSCALEA3V(3,10,17,24,25,26,31,0) +DSCALEA3V(4,11,18,27,28,29,31,0) +DSCALEA3V(5,12,19,0,7,14,31,0) +DSCALEA3V(6,13,20,1,8,15,31,0) +LABEL(ZERO_BETA_C_3_4_5_6_7) +#ifndef __clang__ +" cmp x12, #1 \n\t" +BRANCH(PRFM_END_C) +" prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" +" prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*0] \n\t" +" prfm PLDL1STRM, [%[b_next], #16*1] \n\t" +LABEL(PRFM_END_C) +#endif +DSTOREC_3V_C_FWD(2,9,16,x5,0,x7) +DSTOREC_3V_C_FWD(3,10,17,x5,0,x7) +DSTOREC_3V_C_FWD(4,11,18,x5,0,x7) +DSTOREC_3V_C_FWD(5,12,19,x5,0,x7) +DSTOREC_3V_C_FWD(6,13,20,x5,0,x7) +// +// End of this microkernel. +LABEL(END_WRITE_MEM) +" \n\t" +" subs x12, x12, #1 \n\t" +BEQ(END_EXEC) +" \n\t" +" mov x8, #6 \n\t" +" madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. +" add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. +BRANCH(MILLIKER_MLOOP) +// +// End of execution. +LABEL(END_EXEC) +: +: [a] "m" (a), + [b] "m" (b), + [c] "m" (c), + [rs_a] "m" (rs_a), + [cs_a] "m" (cs_a), + [ps_a] "m" (ps_a), + [rs_b] "m" (rs_b), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + // In Clang, even "m"-passed parameter takes 1 register. + // Have to disable prefetching to pass compilation. 
+#ifndef __clang__ + [a_next] "r" (a_next), + [b_next] "r" (b_next), +#endif + [m_iter] "m" (m_iter), + [k_mker] "m" (k_mker), + [k_left] "m" (k_left), + [alpha] "m" (alpha), + [beta] "m" (beta) +: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10","x11","x12","x13","x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10","v11","v12","v13","v14","v15", + "v16","v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + ); + +consider_edge_cases: + // Forward address. + a = a + m_iter * ps_a; + c = c + m_iter * 6 * rs_c; + auxinfo_t data_d6x4mn = *data; + bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); + bli_dgemmsup_rv_armv8a_int_6x4mn + ( + conja, conjb, m_left, 7, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx + ); + +} + diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c index 8ff5ec173..b912480fa 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -146,47 +145,70 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m { if ( n0 != 8 ) { - if ( n0 < 8 ) + assert( n0 <= 13 ); + + // Manual separation. + dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dim_t nr1, nr2; + + if ( n0 == 13 ) { - for ( ; n0 >= 4; n0 -= 4 ) - { - dgemmsup_ker_ft ukr_fp; - auxinfo_t data_d8xkm = *data; - if ( bli_auxinfo_ps_a( data ) == 6 * rs_a0 ) - { - // Use 8x4 Asm kernel for the unpacked case. - bli_auxinfo_set_ps_a( 8 * rs_a0, &data_d8xkm ); - ukr_fp = bli_dgemmsup_rv_armv8a_asm_8x4m; - } - else - { - // Cannot change dimension for m when A is packed. 
- ukr_fp = bli_dgemmsup_rv_armv8a_int_6x4mn; - } - - ukr_fp - ( - conja, conjb, m0, 4, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, &data_d8xkm, cntx - ); - b += 4 * cs_b0; - c += 4 * cs_c0; - } - if ( n0 > 0 ) - { - bli_dgemmsup_rv_armv8a_int_6x4mn - ( - conja, conjb, m0, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - } + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6; } - else + if ( n0 == 12 ) { - assert( FALSE ); + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr2 = 6; } + if ( n0 == 11 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5; + } + if ( n0 == 10 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr2 = 5; + } + if ( n0 == 9 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5; + ker_fp2 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr2 = 4; + } + if ( n0 == 7 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x7m; nr1 = 7; + } + if ( n0 == 6 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x6m; nr1 = 6; + } + if ( n0 == 5 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_6x5m; nr1 = 5; + } + if ( n0 <= 4 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_int_6x4mn; nr1 = n0; + } + + ker_fp1 + ( + conja, conjb, m0, nr1, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + b += nr1 * cs_b0; + c += nr1 * cs_c0; + if ( ker_fp2 ) + ker_fp2 + ( + conja, conjb, m0, nr2, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); return; } @@ -534,7 +556,6 @@ LABEL(END_EXEC) // Forward address. 
a = a + m_iter * ps_a; c = c + m_iter * 6 * rs_c; -#if 1 auxinfo_t data_d6x4mn = *data; bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); bli_dgemmsup_rv_armv8a_int_6x4mn @@ -543,33 +564,6 @@ LABEL(END_EXEC) alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx ); -#else - if ( m_left >= 4 ) - { - // Calls 4x8m with only 1 outermost loop. - // As only 1 outermost loop is called, - // ps_a needs not being set here. - // - bli_dgemmsup_rv_armv8a_asm_4x8m - ( - conja, conjb, 4, 8, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - m_left -= 4; - a = a + 4 * rs_a; - c = c + 4 * rs_c; - } - if ( m_left ) - { - bli_dgemmsup_r_armv8a_ref2 - ( - conja, conjb, m_left, 8, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - } -#endif } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c index 9bdf4b3b8..910e07dbb 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c @@ -37,7 +37,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" @@ -146,33 +145,56 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n { if ( m0 != 6 ) { - // 5 = 4 + 1; - // 4; - // - while ( m0 >= 4 ) + assert( m0 <= 9 ); + + // Manual separation. 
+ dgemmsup_ker_ft ker_fp1 = NULL; + dgemmsup_ker_ft ker_fp2 = NULL; + dim_t mr1, mr2; + + if ( m0 == 9 ) { - bli_dgemmsup_rv_armv8a_asm_4x8n - ( - conja, conjb, 4, n0, k0, - alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, - beta, c, rs_c0, cs_c0, data, cntx - ); - m0 -= 4; - a += 4 * rs_a0; - c += 4 * rs_c0; + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4; } - - // 3, 2, 1; - // - if ( m0 > 0 ) + if ( m0 == 8 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4; + ker_fp2 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr2 = 4; + } + if ( m0 == 7 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4; + ker_fp2 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr2 = 3; + } + if ( m0 == 5 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_5x8n; mr1 = 5; + } + if ( m0 == 4 ) { - bli_dgemmsup_rv_armv8a_int_3x8mn + ker_fp1 = bli_dgemmsup_rv_armv8a_asm_4x8n; mr1 = 4; + } + if ( m0 < 4 ) + { + ker_fp1 = bli_dgemmsup_rv_armv8a_int_3x8mn; mr1 = m0; + } + + ker_fp1 + ( + conja, conjb, mr1, n0, k0, + alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, + beta, c, rs_c0, cs_c0, data, cntx + ); + a += mr1 * rs_a0; + c += mr1 * rs_c0; + if ( ker_fp2 ) + ker_fp2 ( - conja, conjb, m0, n0, k0, + conja, conjb, mr2, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); - } return; } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c index 4d374df98..d3af5781c 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c @@ -36,7 +36,6 @@ #include "blis.h" #include "assert.h" -GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. 
#include "../armv8a_asm_utils.h" diff --git a/kernels/armv8a/bli_kernels_armv8a.h b/kernels/armv8a/bli_kernels_armv8a.h index b7ab75541..64a3f2fb5 100644 --- a/kernels/armv8a/bli_kernels_armv8a.h +++ b/kernels/armv8a/bli_kernels_armv8a.h @@ -39,6 +39,8 @@ PACKM_KER_PROT( double, d, packm_armv8a_int_8xk ) GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 ) GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 ) +GEMM_UKR_PROT( float, s, gemm_armv8a_asm_12x8r ) +GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x6r ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x4 ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_4x4 ) @@ -47,6 +49,10 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x7m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x6m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x5m ) +GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_8x4m ) From a87eae2b11408b556e562f1b04e673c6cd1612bc Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 6 Sep 2022 18:04:09 -0500 Subject: [PATCH 076/230] Added '-q' quiet mode option to testsuite. (#657) Details: - Added support for a '-q' command line option to the testsuite. This option suppresses most informational output that would normally clutter up the screen. By default, verbose mode (the previous status quo) will be operative, and so quiet mode must be requested. 
--- testsuite/src/test_libblis.c | 58 ++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index eaa0a9cef..442fae0e0 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -42,6 +42,8 @@ char libblis_test_binary_name[ MAX_BINARY_NAME_LENGTH + 1 ]; char libblis_test_parameters_filename[ MAX_FILENAME_LENGTH + 1 ]; char libblis_test_operations_filename[ MAX_FILENAME_LENGTH + 1 ]; +bool libblis_test_quiet_mode = FALSE; + char libblis_test_pass_string[ MAX_PASS_STRING_LENGTH + 1 ]; char libblis_test_warn_string[ MAX_PASS_STRING_LENGTH + 1 ]; char libblis_test_fail_string[ MAX_PASS_STRING_LENGTH + 1 ]; @@ -720,6 +722,9 @@ void libblis_test_read_op_info( test_ops_t* ops, void libblis_test_output_section_overrides( FILE* os, test_ops_t* ops ) { + // Skip informational output if BLIS is running in quiet mode. + if ( libblis_test_quiet_mode ) return; + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- Section overrides ---\n" ); libblis_test_fprintf_c( os, "\n" ); @@ -746,6 +751,17 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) cntx_t* cntx_c; cntx_t* cntx_z; +#ifndef BLIS_ENABLE_GEMM_MD + // Notify the user if mixed domain or mixed precision was requested. + if ( params->mixed_domain || params->mixed_precision ) + { + libblis_test_printf_error( "mixed domain and/or mixed precision testing requested, but building against BLIS without mixed datatype support.\n" ); + } +#endif + + // Skip informational output if BLIS is running in quiet mode. + if ( libblis_test_quiet_mode ) return; + // If bli_info_get_int_type_size() returns 32 or 64, the size is forced. // Otherwise, the size is chosen automatically. We query the result of // that automatic choice via sizeof(gint_t). 
@@ -1241,14 +1257,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf( os, "\n" ); -#ifndef BLIS_ENABLE_GEMM_MD - // Notify the user if mixed domain or mixed precision was requested. - if ( params->mixed_domain || params->mixed_precision ) - { - libblis_test_printf_error( "mixed domain and/or mixed precision testing requested, but building against BLIS without mixed datatype support.\n" ); - } -#endif - // If mixed domain or mixed precision was requested, we disable all // induced methods except 1m and native execution. if ( params->mixed_domain || params->mixed_precision ) @@ -1267,6 +1275,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) void libblis_test_output_op_struct( FILE* os, test_op_t* op, char* op_str ) { + // Skip informational output if BLIS is running in quiet mode. + if ( libblis_test_quiet_mode ) return; + + libblis_test_fprintf_c( os, "--- %s ---\n", op_str ); + libblis_test_fprintf_c( os, "\n" ); + dimset_t dimset = op->dimset; if ( dimset == BLIS_TEST_DIMS_MNK ) @@ -2086,8 +2100,6 @@ void libblis_test_op_driver if ( tdata->id == 0 ) { // Output a heading and the contents of the op struct. - libblis_test_fprintf_c( stdout, "--- %s ---\n", op_str ); - libblis_test_fprintf_c( stdout, "\n" ); libblis_test_output_op_struct( stdout, op, op_str ); // Also output to a matlab file if requested (and successfully opened). @@ -2099,8 +2111,6 @@ void libblis_test_op_driver // stdout (at the end of libblis_test_read_parameter_file()). 
libblis_test_output_params_struct( output_stream, params ); - libblis_test_fprintf_c( output_stream, "--- %s ---\n", op_str ); - libblis_test_fprintf_c( output_stream, "\n" ); libblis_test_output_op_struct( output_stream, op, op_str ); } } @@ -3082,7 +3092,7 @@ void libblis_test_parse_command_line( int argc, char** argv ) bli_getopt_init_state( 0, &state ); // Process all option arguments until we get a -1, which means we're done. - while( (opt = bli_getopt( argc, ( const char** )argv, "g:o:", &state )) != -1 ) + while( (opt = bli_getopt( argc, ( const char** )argv, "g:o:q", &state )) != -1 ) { // Explicitly typecast opt, which is an int, to a char. (Failing to // typecast resulted in at least one user-reported problem whereby @@ -3092,19 +3102,21 @@ void libblis_test_parse_command_line( int argc, char** argv ) switch( opt_ch ) { case 'g': - libblis_test_printf_infoc( "detected -g option; using \"%s\" for parameters filename.\n", state.optarg ); strncpy( libblis_test_parameters_filename, state.optarg, MAX_FILENAME_LENGTH ); gave_option_g = TRUE; break; case 'o': - libblis_test_printf_infoc( "detected -o option; using \"%s\" for operations filename.\n", state.optarg ); strncpy( libblis_test_operations_filename, state.optarg, MAX_FILENAME_LENGTH ); gave_option_o = TRUE; break; + case 'q': + libblis_test_quiet_mode = TRUE; + break; + case '?': libblis_test_printf_error( "unexpected option '%c' given or missing option argument\n", state.optopt ); break; @@ -3116,21 +3128,37 @@ void libblis_test_parse_command_line( int argc, char** argv ) if ( gave_option_g == FALSE ) { + // Skip informational output if BLIS is running in quiet mode. + if ( !libblis_test_quiet_mode ) libblis_test_printf_infoc( "no -g option given; defaulting to \"%s\" for parameters filename.\n", PARAMETERS_FILENAME ); // Copy default parameters filename into its global string. 
strncpy( libblis_test_parameters_filename, PARAMETERS_FILENAME, MAX_FILENAME_LENGTH ); } + else + { + // Skip informational output if BLIS is running in quiet mode. + if ( !libblis_test_quiet_mode ) + libblis_test_printf_infoc( "detected -g option; using \"%s\" for parameters filename.\n", state.optarg ); + } if ( gave_option_o == FALSE ) { + // Skip informational output if BLIS is running in quiet mode. + if ( !libblis_test_quiet_mode ) libblis_test_printf_infoc( "no -o option given; defaulting to \"%s\" for operations filename.\n", OPERATIONS_FILENAME ); // Copy default operations filename into its global string. strncpy( libblis_test_operations_filename, OPERATIONS_FILENAME, MAX_FILENAME_LENGTH ); } + else + { + // Skip informational output if BLIS is running in quiet mode. + if ( !libblis_test_quiet_mode ) + libblis_test_printf_infoc( "detected -o option; using \"%s\" for operations filename.\n", state.optarg ); + } // If there are still arguments remaining after getopt() processing is // complete, print an error. From 4afe0cfdab0e069e027f97920ea604249e34df47 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 8 Sep 2022 18:33:20 -0500 Subject: [PATCH 077/230] Defined invscalv, invscalm, invscald operations. (#661) Details: - Defined invert-scale (invscal) operation on vectors (level-1v), matrices (level-1m), and diagonals (level-1d). - Added test modules for invscalv and invscalm to the testsuite. - Updated BLISObjectAPI.md and BLISTypedAPI.md API documentation to reflect the new operations. Also updated KernelsHowTo.md accordingly. - Renamed 'beta' to 'alpha' in scalv and scalm testsuite modules (and input.operations files) so that the parameter name matches the parameter used in the documentation. 
--- docs/BLISObjectAPI.md | 55 +++++- docs/BLISTypedAPI.md | 60 +++++- docs/KernelsHowTo.md | 28 ++- frame/1/bli_l1v_check.c | 1 + frame/1/bli_l1v_check.h | 1 + frame/1/bli_l1v_fpa.c | 1 + frame/1/bli_l1v_fpa.h | 1 + frame/1/bli_l1v_ft.h | 3 +- frame/1/bli_l1v_ft_ker.h | 3 +- frame/1/bli_l1v_ker.h | 6 + frame/1/bli_l1v_ker_prot.h | 12 ++ frame/1/bli_l1v_oapi.c | 1 + frame/1/bli_l1v_oapi.h | 1 + frame/1/bli_l1v_tapi.c | 1 + frame/1/bli_l1v_tapi.h | 1 + frame/1d/bli_l1d_check.c | 1 + frame/1d/bli_l1d_check.h | 1 + frame/1d/bli_l1d_fpa.c | 1 + frame/1d/bli_l1d_fpa.h | 1 + frame/1d/bli_l1d_ft.h | 3 +- frame/1d/bli_l1d_oapi.c | 1 + frame/1d/bli_l1d_oapi.h | 1 + frame/1d/bli_l1d_tapi.c | 1 + frame/1d/bli_l1d_tapi.h | 1 + frame/1m/bli_l1m_check.c | 1 + frame/1m/bli_l1m_check.h | 1 + frame/1m/bli_l1m_fpa.c | 1 + frame/1m/bli_l1m_fpa.h | 1 + frame/1m/bli_l1m_ft.h | 3 +- frame/1m/bli_l1m_oapi.c | 1 + frame/1m/bli_l1m_oapi.h | 1 + frame/1m/bli_l1m_tapi.c | 1 + frame/1m/bli_l1m_tapi.h | 1 + frame/1m/bli_l1m_unb_var1.c | 1 + frame/1m/bli_l1m_unb_var1.h | 1 + frame/include/bli_type_defs.h | 1 + ref_kernels/1/bli_invscalv_ref.c | 81 +++++++++ ref_kernels/bli_cntx_ref.c | 31 ++-- testsuite/input.operations | 12 +- testsuite/input.operations.fast | 12 +- testsuite/input.operations.mixed | 12 +- testsuite/input.operations.salt | 12 +- testsuite/src/test_invscalm.c | 301 +++++++++++++++++++++++++++++++ testsuite/src/test_invscalm.h | 42 +++++ testsuite/src/test_invscalv.c | 297 ++++++++++++++++++++++++++++++ testsuite/src/test_invscalv.h | 42 +++++ testsuite/src/test_libblis.c | 14 +- testsuite/src/test_libblis.h | 4 + testsuite/src/test_scalm.c | 50 ++--- testsuite/src/test_scalv.c | 48 ++--- 50 files changed, 1070 insertions(+), 88 deletions(-) create mode 100644 ref_kernels/1/bli_invscalv_ref.c create mode 100644 testsuite/src/test_invscalm.c create mode 100644 testsuite/src/test_invscalm.h create mode 100644 testsuite/src/test_invscalv.c create mode 100644 
testsuite/src/test_invscalv.h diff --git a/docs/BLISObjectAPI.md b/docs/BLISObjectAPI.md index 5e8ed3d8f..51f5753a0 100644 --- a/docs/BLISObjectAPI.md +++ b/docs/BLISObjectAPI.md @@ -41,11 +41,11 @@ This index provides a quick way to jump directly to the description for each operation discussed later in the [Computational function reference](BLISObjectAPI.md#computational-function-reference) section: * **[Level-1v](BLISObjectAPI.md#level-1v-operations)**: Operations on vectors: - * [addv](BLISObjectAPI.md#addv), [amaxv](BLISObjectAPI.md#amaxv), [axpyv](BLISObjectAPI.md#axpyv), [axpbyv](BLISObjectAPI.md#axpbyv), [copyv](BLISObjectAPI.md#copyv), [dotv](BLISObjectAPI.md#dotv), [dotxv](BLISObjectAPI.md#dotxv), [invertv](BLISObjectAPI.md#invertv), [scal2v](BLISObjectAPI.md#scal2v), [scalv](BLISObjectAPI.md#scalv), [setv](BLISObjectAPI.md#setv), [setrv](BLISObjectAPI.md#setrv), [setiv](BLISObjectAPI.md#setiv), [subv](BLISObjectAPI.md#subv), [swapv](BLISObjectAPI.md#swapv), [xpbyv](BLISObjectAPI.md#xpbyv) + * [addv](BLISObjectAPI.md#addv), [amaxv](BLISObjectAPI.md#amaxv), [axpyv](BLISObjectAPI.md#axpyv), [axpbyv](BLISObjectAPI.md#axpbyv), [copyv](BLISObjectAPI.md#copyv), [dotv](BLISObjectAPI.md#dotv), [dotxv](BLISObjectAPI.md#dotxv), [invertv](BLISObjectAPI.md#invertv), [invscalv](BLISObjectAPI.md#invscalv), [scalv](BLISObjectAPI.md#scalv), [scal2v](BLISObjectAPI.md#scal2v), [setv](BLISObjectAPI.md#setv), [setrv](BLISObjectAPI.md#setrv), [setiv](BLISObjectAPI.md#setiv), [subv](BLISObjectAPI.md#subv), [swapv](BLISObjectAPI.md#swapv), [xpbyv](BLISObjectAPI.md#xpbyv) * **[Level-1d](BLISObjectAPI.md#level-1d-operations)**: Element-wise operations on matrix diagonals: - * [addd](BLISObjectAPI.md#addd), [axpyd](BLISObjectAPI.md#axpyd), [copyd](BLISObjectAPI.md#copyd), [invertd](BLISObjectAPI.md#invertd), [scald](BLISObjectAPI.md#scald), [scal2d](BLISObjectAPI.md#scal2d), [setd](BLISObjectAPI.md#setd), [setid](BLISObjectAPI.md#setid), [shiftd](BLISObjectAPI.md#shiftd), 
[subd](BLISObjectAPI.md#subd), [xpbyd](BLISObjectAPI.md#xpbyd) + * [addd](BLISObjectAPI.md#addd), [axpyd](BLISObjectAPI.md#axpyd), [copyd](BLISObjectAPI.md#copyd), [invertd](BLISObjectAPI.md#invertd), [invscald](BLISObjectAPI.md#invscald), [scald](BLISObjectAPI.md#scald), [scal2d](BLISObjectAPI.md#scal2d), [setd](BLISObjectAPI.md#setd), [setid](BLISObjectAPI.md#setid), [shiftd](BLISObjectAPI.md#shiftd), [subd](BLISObjectAPI.md#subd), [xpbyd](BLISObjectAPI.md#xpbyd) * **[Level-1m](BLISObjectAPI.md#level-1m-operations)**: Element-wise operations on matrices: - * [addm](BLISObjectAPI.md#addm), [axpym](BLISObjectAPI.md#axpym), [copym](BLISObjectAPI.md#copym), [scalm](BLISObjectAPI.md#scalm), [scal2m](BLISObjectAPI.md#scal2m), [setm](BLISObjectAPI.md#setm), [setrm](BLISObjectAPI.md#setrm), [setim](BLISObjectAPI.md#setim), [subm](BLISObjectAPI.md#subm) + * [addm](BLISObjectAPI.md#addm), [axpym](BLISObjectAPI.md#axpym), [copym](BLISObjectAPI.md#copym), [invscalm](BLISObjectAPI.md#invscalm), [scalm](BLISObjectAPI.md#scalm), [scal2m](BLISObjectAPI.md#scal2m), [setm](BLISObjectAPI.md#setm), [setrm](BLISObjectAPI.md#setrm), [setim](BLISObjectAPI.md#setim), [subm](BLISObjectAPI.md#subm) * **[Level-1f](BLISObjectAPI.md#level-1f-operations)**: Fused operations on multiple vectors: * [axpy2v](BLISObjectAPI.md#axpy2v), [dotaxpyv](BLISObjectAPI.md#dotaxpyv), [axpyf](BLISObjectAPI.md#axpyf), [dotxf](BLISObjectAPI.md#dotxf), [dotxaxpyf](BLISObjectAPI.md#dotxaxpyf) * **[Level-2](BLISObjectAPI.md#level-2-operations)**: Operations with one matrix and (at least) one vector operand: @@ -845,6 +845,24 @@ Invert all elements of an _n_-length vector `x`. --- +#### invscalv +```c +void bli_invscalv + ( + obj_t* alpha, + obj_t* x + ); +``` +Perform +``` + x := ( 1.0 / conj?(alpha) ) * x +``` +where `x` is a vector of length _n_, and `alpha` is a scalar. + +Observed object properties: `conj?(alpha)`. 
+ +--- + #### scalv ```c void bli_scalv @@ -1049,6 +1067,19 @@ Observed object properties: `diagoff(A)`. --- +#### invscald +```c +void bli_invscald + ( + obj_t* alpha, + obj_t* a + ); +``` + +Observed object properties: `conj?(alpha)`, `diagoff(A)`. + +--- + #### scald ```c void bli_scald @@ -1213,6 +1244,24 @@ Observed object properties: `diagoff(A)`, `diag(A)`, `uplo(A)`, `trans?(A)`. --- +#### invscalm +```c +void bli_invscalm + ( + obj_t* alpha, + obj_t* a + ); +``` +Perform +``` + A := ( 1.0 / conj?(alpha) ) * A +``` +where `A` is an _m x n_ matrix stored as a dense matrix, or lower- or upper-triangular/trapezoidal matrix with arbitrary diagonal offset. If `uplo(A)` indicates lower or upper storage, only that part of matrix `A` will be updated. + +Observed object properties: `conj?(alpha)`, `diagoff(A)`, `uplo(A)`. + +--- + #### scalm ```c void bli_scalm diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md index 76d7ef8f6..497776a15 100644 --- a/docs/BLISTypedAPI.md +++ b/docs/BLISTypedAPI.md @@ -36,11 +36,11 @@ This index provides a quick way to jump directly to the description for each operation discussed later in the [Computational function reference](BLISTypedAPI.md#computational-function-reference) section: * **[Level-1v](BLISTypedAPI.md#level-1v-operations)**: Operations on vectors: - * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [axpbyv](BLISTypedAPI.md#axpbyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), [invertv](BLISTypedAPI.md#invertv), [scal2v](BLISTypedAPI.md#scal2v), [scalv](BLISTypedAPI.md#scalv), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv), [xpbyv](BLISTypedAPI.md#xpbyv) + * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [axpbyv](BLISTypedAPI.md#axpbyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), 
[invertv](BLISTypedAPI.md#invertv), [invscalv](BLISTypedAPI.md#invscalv), [scalv](BLISTypedAPI.md#scalv), [scal2v](BLISTypedAPI.md#scal2v), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv), [xpbyv](BLISTypedAPI.md#xpbyv) * **[Level-1d](BLISTypedAPI.md#level-1d-operations)**: Element-wise operations on matrix diagonals: - * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [shiftd](BLISTypedAPI.md#shiftd), [subd](BLISTypedAPI.md#subd), [xpbyd](BLISTypedAPI.md#xpbyd) + * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [invscald](BLISTypedAPI.md#invscald), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [shiftd](BLISTypedAPI.md#shiftd), [subd](BLISTypedAPI.md#subd), [xpbyd](BLISTypedAPI.md#xpbyd) * **[Level-1m](BLISTypedAPI.md#level-1m-operations)**: Element-wise operations on matrices: - * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm) + * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [invscalm](BLISTypedAPI.md#invscalm), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm) * **[Level-1f](BLISTypedAPI.md#level-1f-operations)**: Fused operations on multiple vectors: * [axpy2v](BLISTypedAPI.md#axpy2v), [dotaxpyv](BLISTypedAPI.md#dotaxpyv), [axpyf](BLISTypedAPI.md#axpyf), [dotxf](BLISTypedAPI.md#dotxf), [dotxaxpyf](BLISTypedAPI.md#dotxaxpyf) * 
**[Level-2](BLISTypedAPI.md#level-2-operations)**: Operations with one matrix and (at least) one vector operand: @@ -369,6 +369,24 @@ Invert all elements of an _n_-length vector `x`. --- +#### invscalv +```c +void bli_?invscalv + ( + conj_t conjalpha, + dim_t n, + ctype* alpha, + ctype* x, inc_t incx + ); +``` +Perform +``` + x := ( 1.0 / conjalpha(alpha) ) * x +``` +where `x` is a vector of length _n_, and `alpha` is a scalar. + +--- + #### scalv ```c void bli_?scalv @@ -548,6 +566,21 @@ void bli_?invertd --- +#### invscald +```c +void bli_?invscald + ( + conj_t conjalpha, + doff_t diagoffa, + dim_t m, + dim_t n, + ctype* alpha, + ctype* a, inc_t rsa, inc_t csa + ); +``` + +--- + #### scald ```c void bli_?scald @@ -737,6 +770,27 @@ where `B` is an _m x n_ matrix, `A` is stored as a dense matrix, or lower- or up --- +#### invscalm +```c +void bli_?invscalm + ( + conj_t conjalpha, + doff_t diagoffa, + uplo_t uploa, + dim_t m, + dim_t n, + ctype* alpha, + ctype* a, inc_t rsa, inc_t csa + ); +``` +Perform +``` + A := ( 1.0 / conjalpha(alpha) ) * A +``` +where `A` is an _m x n_ matrix stored as a dense matrix, or lower- or upper-triangular/trapezoidal matrix, as specified by `uploa`, with the diagonal offset of `A` specified by `diagoffa`. If `uploa` indicates lower or upper storage, only that part of matrix `A` will be updated. 
+ +--- + #### scalm ```c void bli_?scalm diff --git a/docs/KernelsHowTo.md b/docs/KernelsHowTo.md index 6e84db8e7..30a4dc736 100644 --- a/docs/KernelsHowTo.md +++ b/docs/KernelsHowTo.md @@ -22,11 +22,11 @@ One of the primary features of BLIS is that it provides a large set of dense lin Presently, BLIS supports several groups of operations: * **[Level-1v](BLISTypedAPI.md#level-1v-operations)**: Operations on vectors: - * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), [invertv](BLISTypedAPI.md#invertv), [scal2v](BLISTypedAPI.md#scal2v), [scalv](BLISTypedAPI.md#scalv), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv) + * [addv](BLISTypedAPI.md#addv), [amaxv](BLISTypedAPI.md#amaxv), [axpyv](BLISTypedAPI.md#axpyv), [copyv](BLISTypedAPI.md#copyv), [dotv](BLISTypedAPI.md#dotv), [dotxv](BLISTypedAPI.md#dotxv), [invertv](BLISTypedAPI.md#invertv), [invscalv](BLISTypedAPI.md#invscalv), [scalv](BLISTypedAPI.md#scalv), [scal2v](BLISTypedAPI.md#scal2v), [setv](BLISTypedAPI.md#setv), [subv](BLISTypedAPI.md#subv), [swapv](BLISTypedAPI.md#swapv) * **[Level-1d](BLISTypedAPI.md#level-1d-operations)**: Element-wise operations on matrix diagonals: - * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [subd](BLISTypedAPI.md#subd) + * [addd](BLISTypedAPI.md#addd), [axpyd](BLISTypedAPI.md#axpyd), [copyd](BLISTypedAPI.md#copyd), [invertd](BLISTypedAPI.md#invertd), [invscald](BLISTypedAPI.md#invscald), [scald](BLISTypedAPI.md#scald), [scal2d](BLISTypedAPI.md#scal2d), [setd](BLISTypedAPI.md#setd), [setid](BLISTypedAPI.md#setid), [subd](BLISTypedAPI.md#subd) * 
**[Level-1m](BLISTypedAPI.md#level-1m-operations)**: Element-wise operations on matrices: - * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm) + * [addm](BLISTypedAPI.md#addm), [axpym](BLISTypedAPI.md#axpym), [copym](BLISTypedAPI.md#copym), [invscalm](BLISTypedAPI.md#invscalm), [scalm](BLISTypedAPI.md#scalm), [scal2m](BLISTypedAPI.md#scal2m), [setm](BLISTypedAPI.md#setm), [subm](BLISTypedAPI.md#subm) * **[Level-1f](BLISTypedAPI.md#level-1f-operations)**: Fused operations on multiple vectors: * [axpy2v](BLISTypedAPI.md#axpy2v), [dotaxpyv](BLISTypedAPI.md#dotaxpyv), [axpyf](BLISTypedAPI.md#axpyf), [dotxf](BLISTypedAPI.md#dotxf), [dotxaxpyf](BLISTypedAPI.md#dotxaxpyf) * **[Level-2](BLISTypedAPI.md#level-2-operations)**: Operations with one matrix and (at least) one vector operand: @@ -81,6 +81,7 @@ BLIS supports the following 14 level-1v kernels. These kernels are used primaril * **dotv**: Performs a [dot product](BLISTypedAPI.md#dotv) where the output scalar is overwritten. * **dotxv**: Performs an [extended dot product](BLISTypedAPI.md#dotxv) operation where the dot product is first scaled and then accumulated into a scaled output scalar. * **invertv**: Performs an [element-wise vector inversion](BLISTypedAPI.md#invertv) operation. + * **invscalv**: Performs an [in-place (destructive) vector inverse-scaling](BLISTypedAPI.md#invscalv) operation. * **scalv**: Performs an [in-place (destructive) vector scaling](BLISTypedAPI.md#scalv) operation. * **scal2v**: Performs an [out-of-place (non-destructive) vector scaling](BLISTypedAPI.md#scal2v) operation. * **setv**: Performs a [vector broadcast](BLISTypedAPI.md#setv) operation. @@ -184,6 +185,7 @@ datatype characters. 
| copyv | `BLIS_COPYV_KER` | `?copyv_ft` | | dotxv | `BLIS_DOTXV_KER` | `?dotxv_ft` | | invertv | `BLIS_INVERTV_KER` | `?invertv_ft` | +| invscalv | `BLIS_INVSCALV_KER` | `?invscalv_ft` | | scalv | `BLIS_SCALV_KER` | `?scalv_ft` | | scal2v | `BLIS_SCAL2V_KER` | `?scal2v_ft` | | setv | `BLIS_SETV_KER` | `?setv_ft` | @@ -220,6 +222,7 @@ This section seeks to provide developers with a complete reference for each of t * [dotv](KernelsHowTo.md#dotv-kernel) * [dotxv](KernelsHowTo.md#dotxv-kernel) * [invertv](KernelsHowTo.md#invertv-kernel) + * [invscalv](KernelsHowTo.md#invscalv-kernel) * [scalv](KernelsHowTo.md#scalv-kernel) * [scal2v](KernelsHowTo.md#scal2v-kernel) * [setv](KernelsHowTo.md#setv-kernel) @@ -929,6 +932,25 @@ This kernel inverts all elements of an _n_-length vector `x`. --- +#### invscalv kernel +```c +void bli_?invscalv_ + ( + conj_t conjalpha, + dim_t n, + ctype* restrict alpha, + ctype* restrict x, inc_t incx, + cntx_t* restrict cntx + ) +``` +This kernel performs the following operation: +``` + x := ( 1.0 / conjalpha(alpha) ) * x +``` +where `x` is a vector of length _n_ stored with stride `incx` and `alpha` is a scalar. 
+ +--- + #### scalv kernel ```c void bli_?scalv_ diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c index 8ab470bf4..f2c4622d5 100644 --- a/frame/1/bli_l1v_check.c +++ b/frame/1/bli_l1v_check.c @@ -165,6 +165,7 @@ void PASTEMAC(opname,_check) \ bli_l1v_ax_check( alpha, x ); \ } +GENFRONT( invscalv ) GENFRONT( scalv ) GENFRONT( setv ) diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h index 110b25d55..cfd6d9e6e 100644 --- a/frame/1/bli_l1v_check.h +++ b/frame/1/bli_l1v_check.h @@ -140,6 +140,7 @@ void PASTEMAC(opname,_check) \ const obj_t* x \ ); +GENTPROT( invscalv ) GENTPROT( scalv ) GENTPROT( setv ) diff --git a/frame/1/bli_l1v_fpa.c b/frame/1/bli_l1v_fpa.c index 311f0b2b9..a88aba93d 100644 --- a/frame/1/bli_l1v_fpa.c +++ b/frame/1/bli_l1v_fpa.c @@ -60,6 +60,7 @@ GENFRONT( scal2v ) GENFRONT( dotv ) GENFRONT( dotxv ) GENFRONT( invertv ) +GENFRONT( invscalv ) GENFRONT( scalv ) GENFRONT( setv ) GENFRONT( swapv ) diff --git a/frame/1/bli_l1v_fpa.h b/frame/1/bli_l1v_fpa.h index c05a4ff7b..52d477d30 100644 --- a/frame/1/bli_l1v_fpa.h +++ b/frame/1/bli_l1v_fpa.h @@ -52,6 +52,7 @@ GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) +GENPROT( invscalv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h index 57f9d223a..244b926ca 100644 --- a/frame/1/bli_l1v_ft.h +++ b/frame/1/bli_l1v_ft.h @@ -158,7 +158,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( invertv ) -// scalv, setv +// invscalv, scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ @@ -172,6 +172,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ BLIS_TAPI_EX_PARAMS \ ); +INSERT_GENTDEF( invscalv ) INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) diff --git a/frame/1/bli_l1v_ft_ker.h b/frame/1/bli_l1v_ft_ker.h index fd3f14c1c..ade2c98eb 100644 --- a/frame/1/bli_l1v_ft_ker.h +++ b/frame/1/bli_l1v_ft_ker.h @@ -161,7 +161,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ 
INSERT_GENTDEF( invertv ) -// scalv, setv +// invscalv, scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ @@ -175,6 +175,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ cntx_t* cntx \ ); +INSERT_GENTDEF( invscalv ) INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) diff --git a/frame/1/bli_l1v_ker.h b/frame/1/bli_l1v_ker.h index e91813a07..4ebbffa82 100644 --- a/frame/1/bli_l1v_ker.h +++ b/frame/1/bli_l1v_ker.h @@ -90,6 +90,12 @@ INSERT_GENTPROT_BASIC0( dotxv_ker_name ) INSERT_GENTPROT_BASIC0( invertv_ker_name ) +#undef GENTPROT +#define GENTPROT INVSCALV_KER_PROT + +INSERT_GENTPROT_BASIC0( invscalv_ker_name ) + + #undef GENTPROT #define GENTPROT SCALV_KER_PROT diff --git a/frame/1/bli_l1v_ker_prot.h b/frame/1/bli_l1v_ker_prot.h index b912ba7e0..965626392 100644 --- a/frame/1/bli_l1v_ker_prot.h +++ b/frame/1/bli_l1v_ker_prot.h @@ -139,6 +139,18 @@ void PASTEMAC(ch,opname) \ ); \ +#define INVSCALV_KER_PROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + conj_t conjalpha, \ + dim_t n, \ + ctype* restrict alpha, \ + ctype* restrict x, inc_t incx, \ + cntx_t* cntx \ + ); \ + + #define SCALV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c index 4ea241693..ae12250e7 100644 --- a/frame/1/bli_l1v_oapi.c +++ b/frame/1/bli_l1v_oapi.c @@ -460,6 +460,7 @@ void PASTEMAC(opname,EX_SUF) \ ); \ } +GENFRONT( invscalv ) GENFRONT( scalv ) GENFRONT( setv ) diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h index 957747a2a..b503cf9f4 100644 --- a/frame/1/bli_l1v_oapi.h +++ b/frame/1/bli_l1v_oapi.h @@ -147,6 +147,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ BLIS_OAPI_EX_PARAMS \ ); +GENTPROT( invscalv ) GENTPROT( scalv ) GENTPROT( setv ) diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 01e3356d5..b22ba365f 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -341,6 +341,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ); \ } 
+INSERT_GENTFUNC_BASIC( invscalv, BLIS_INVSCALV_KER ) INSERT_GENTFUNC_BASIC( scalv, BLIS_SCALV_KER ) INSERT_GENTFUNC_BASIC( setv, BLIS_SETV_KER ) diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h index c1965cb3c..8eaf2b185 100644 --- a/frame/1/bli_l1v_tapi.h +++ b/frame/1/bli_l1v_tapi.h @@ -163,6 +163,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ BLIS_TAPI_EX_PARAMS \ ); \ +INSERT_GENTPROT_BASIC0( invscalv ) INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c index fcc62a757..776ab8aee 100644 --- a/frame/1d/bli_l1d_check.c +++ b/frame/1d/bli_l1d_check.c @@ -98,6 +98,7 @@ void PASTEMAC(opname,_check) \ bli_l1d_ax_check( alpha, x ); \ } +GENFRONT( invscald ) GENFRONT( scald ) GENFRONT( setd ) GENFRONT( setid ) diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h index 1ef57e236..56286f9ee 100644 --- a/frame/1d/bli_l1d_check.h +++ b/frame/1d/bli_l1d_check.h @@ -85,6 +85,7 @@ void PASTEMAC(opname,_check) \ const obj_t* x \ ); +GENTPROT( invscald ) GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) diff --git a/frame/1d/bli_l1d_fpa.c b/frame/1d/bli_l1d_fpa.c index ec4c222ab..371f9289b 100644 --- a/frame/1d/bli_l1d_fpa.c +++ b/frame/1d/bli_l1d_fpa.c @@ -56,6 +56,7 @@ GENFRONT( subd ) GENFRONT( axpyd ) GENFRONT( scal2d ) GENFRONT( invertd ) +GENFRONT( invscald ) GENFRONT( scald ) GENFRONT( setd ) GENFRONT( setid ) diff --git a/frame/1d/bli_l1d_fpa.h b/frame/1d/bli_l1d_fpa.h index 4516912de..11fb36192 100644 --- a/frame/1d/bli_l1d_fpa.h +++ b/frame/1d/bli_l1d_fpa.h @@ -48,6 +48,7 @@ GENPROT( subd ) GENPROT( axpyd ) GENPROT( scal2d ) GENPROT( invertd ) +GENPROT( invscald ) GENPROT( scald ) GENPROT( setd ) GENPROT( setid ) diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h index 3de317527..b14e17b6a 100644 --- a/frame/1d/bli_l1d_ft.h +++ b/frame/1d/bli_l1d_ft.h @@ -95,7 +95,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( invertd ) -// 
scald, setd +// invscald, scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ @@ -111,6 +111,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ BLIS_TAPI_EX_PARAMS \ ); +INSERT_GENTDEF( invscald ) INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c index 7027e7780..8dfd9cad0 100644 --- a/frame/1d/bli_l1d_oapi.c +++ b/frame/1d/bli_l1d_oapi.c @@ -260,6 +260,7 @@ void PASTEMAC(opname,EX_SUF) \ ); \ } +GENFRONT( invscald ) GENFRONT( scald ) GENFRONT( setd ) diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h index 66f9d698c..81171f3b8 100644 --- a/frame/1d/bli_l1d_oapi.h +++ b/frame/1d/bli_l1d_oapi.h @@ -89,6 +89,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ BLIS_OAPI_EX_PARAMS \ ); +GENTPROT( invscald ) GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index 60916cd56..907afb703 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -312,6 +312,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ); \ } +INSERT_GENTFUNC_BASIC2( invscald, invscalv, BLIS_INVSCALV_KER ) INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER ) INSERT_GENTFUNC_BASIC2( setd, setv, BLIS_SETV_KER ) diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h index 831b3d390..8fe882f0c 100644 --- a/frame/1d/bli_l1d_tapi.h +++ b/frame/1d/bli_l1d_tapi.h @@ -106,6 +106,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ BLIS_TAPI_EX_PARAMS \ ); +INSERT_GENTPROT_BASIC0( invscald ) INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c index f5d4bf1b4..92f192838 100644 --- a/frame/1m/bli_l1m_check.c +++ b/frame/1m/bli_l1m_check.c @@ -84,6 +84,7 @@ void PASTEMAC(opname,_check) \ bli_l1m_ax_check( alpha, x ); \ } +GENFRONT( invscalm ) GENFRONT( scalm ) GENFRONT( setm ) diff --git a/frame/1m/bli_l1m_check.h b/frame/1m/bli_l1m_check.h index 6089dfa17..d767f104c 
100644 --- a/frame/1m/bli_l1m_check.h +++ b/frame/1m/bli_l1m_check.h @@ -74,6 +74,7 @@ void PASTEMAC(opname,_check) \ const obj_t* x \ ); +GENPROT( invscalm ) GENPROT( scalm ) GENPROT( setm ) diff --git a/frame/1m/bli_l1m_fpa.c b/frame/1m/bli_l1m_fpa.c index c3d13fb51..7299dd7c8 100644 --- a/frame/1m/bli_l1m_fpa.c +++ b/frame/1m/bli_l1m_fpa.c @@ -55,6 +55,7 @@ GENFRONT( copym ) GENFRONT( subm ) GENFRONT( axpym ) GENFRONT( scal2m ) +GENFRONT( invscalm ) GENFRONT( scalm ) GENFRONT( setm ) GENFRONT( xpbym ) diff --git a/frame/1m/bli_l1m_fpa.h b/frame/1m/bli_l1m_fpa.h index 84ef8b77f..9de988559 100644 --- a/frame/1m/bli_l1m_fpa.h +++ b/frame/1m/bli_l1m_fpa.h @@ -47,6 +47,7 @@ GENPROT( copym ) GENPROT( subm ) GENPROT( axpym ) GENPROT( scal2m ) +GENPROT( invscalm ) GENPROT( scalm ) GENPROT( setm ) GENPROT( xpbym ) diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index 36d06b2fe..0851470dd 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -101,7 +101,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ INSERT_GENTDEF( scal2m ) -// scalm, setm +// invscalm, scalm, setm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ @@ -119,6 +119,7 @@ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ BLIS_TAPI_EX_PARAMS \ ); +INSERT_GENTDEF( invscalm ) INSERT_GENTDEF( scalm ) INSERT_GENTDEF( setm ) diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c index 7520afce7..775d69018 100644 --- a/frame/1m/bli_l1m_oapi.c +++ b/frame/1m/bli_l1m_oapi.c @@ -237,6 +237,7 @@ void PASTEMAC(opname,EX_SUF) \ ); \ } +GENFRONT( invscalm ) GENFRONT( scalm ) diff --git a/frame/1m/bli_l1m_oapi.h b/frame/1m/bli_l1m_oapi.h index 9510f1aee..6873e9903 100644 --- a/frame/1m/bli_l1m_oapi.h +++ b/frame/1m/bli_l1m_oapi.h @@ -77,6 +77,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ BLIS_OAPI_EX_PARAMS \ ); +GENPROT( invscalm ) GENPROT( scalm ) GENPROT( setm ) diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index 6b802b9fe..0a641cf9e 100644 --- 
a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -378,6 +378,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ); \ } +INSERT_GENTFUNC_BASIC0( invscalm ) INSERT_GENTFUNC_BASIC0( scalm ) INSERT_GENTFUNC_BASIC0( setm ) diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h index 68646a71f..531fae075 100644 --- a/frame/1m/bli_l1m_tapi.h +++ b/frame/1m/bli_l1m_tapi.h @@ -95,6 +95,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ BLIS_TAPI_EX_PARAMS \ ); +INSERT_GENTPROT_BASIC0( invscalm ) INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index c979f082a..1bcd9b9ca 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -376,6 +376,7 @@ void PASTEMAC(ch,opname) \ } \ } +INSERT_GENTFUNC_BASIC2( invscalm_unb_var1, invscalv, BLIS_INVSCALV_KER ) INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER ) INSERT_GENTFUNC_BASIC2( setm_unb_var1, setv, BLIS_SETV_KER ) diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h index 0364d4b7c..fe01989e3 100644 --- a/frame/1m/bli_l1m_unb_var1.h +++ b/frame/1m/bli_l1m_unb_var1.h @@ -98,6 +98,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ rntm_t* rntm \ ); +INSERT_GENTPROT_BASIC0( invscalm ) INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 08c7ddc4a..b5c3ec255 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -635,6 +635,7 @@ typedef enum BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, + BLIS_INVSCALV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, diff --git a/ref_kernels/1/bli_invscalv_ref.c b/ref_kernels/1/bli_invscalv_ref.c new file mode 100644 index 000000000..a2263ee58 --- /dev/null +++ b/ref_kernels/1/bli_invscalv_ref.c @@ -0,0 +1,81 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + conj_t conjalpha, \ + dim_t n, \ + ctype* restrict alpha, \ + ctype* restrict x, inc_t incx, \ + cntx_t* cntx \ + ) \ +{ \ + if ( bli_zero_dim1( n ) ) return; \ +\ + /* If alpha is one, return. */ \ + if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \ +\ + /* If alpha is zero, inv(alpha) is undefined. Bad user! Return early. 
*/ \ + if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ +\ + ctype alpha_conj; \ +\ + PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \ +\ + if ( incx == 1 ) \ + { \ + PRAGMA_SIMD \ + for ( dim_t i = 0; i < n; ++i ) \ + { \ + PASTEMAC(ch,invscals)( alpha_conj, x[i] ); \ + } \ + } \ + else \ + { \ + for ( dim_t i = 0; i < n; ++i ) \ + { \ + PASTEMAC(ch,invscals)( alpha_conj, *x ); \ +\ + x += incx; \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC2( invscalv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index e094db54b..11c3091e9 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -173,6 +173,8 @@ #define dotxv_ker_name GENARNAME(dotxv) #undef invertv_ker_name #define invertv_ker_name GENARNAME(invertv) +#undef invscalv_ker_name +#define invscalv_ker_name GENARNAME(invscalv) #undef scalv_ker_name #define scalv_ker_name GENARNAME(scalv) #undef scal2v_ker_name @@ -380,20 +382,21 @@ void GENBARNAME(cntx_init) // -- Set level-1v kernels ------------------------------------------------- - gen_func_init( &funcs[ BLIS_ADDV_KER ], addv_ker_name ); - gen_func_init( &funcs[ BLIS_AMAXV_KER ], amaxv_ker_name ); - gen_func_init( &funcs[ BLIS_AXPBYV_KER ], axpbyv_ker_name ); - gen_func_init( &funcs[ BLIS_AXPYV_KER ], axpyv_ker_name ); - gen_func_init( &funcs[ BLIS_COPYV_KER ], copyv_ker_name ); - gen_func_init( &funcs[ BLIS_DOTV_KER ], dotv_ker_name ); - gen_func_init( &funcs[ BLIS_DOTXV_KER ], dotxv_ker_name ); - gen_func_init( &funcs[ BLIS_INVERTV_KER ], invertv_ker_name ); - gen_func_init( &funcs[ BLIS_SCALV_KER ], scalv_ker_name ); - gen_func_init( &funcs[ BLIS_SCAL2V_KER ], scal2v_ker_name ); - gen_func_init( &funcs[ BLIS_SETV_KER ], setv_ker_name ); - gen_func_init( &funcs[ BLIS_SUBV_KER ], subv_ker_name ); - gen_func_init( &funcs[ BLIS_SWAPV_KER ], swapv_ker_name ); - gen_func_init( &funcs[ BLIS_XPBYV_KER ], xpbyv_ker_name ); + gen_func_init( &funcs[ BLIS_ADDV_KER ], addv_ker_name ); + 
gen_func_init( &funcs[ BLIS_AMAXV_KER ], amaxv_ker_name ); + gen_func_init( &funcs[ BLIS_AXPBYV_KER ], axpbyv_ker_name ); + gen_func_init( &funcs[ BLIS_AXPYV_KER ], axpyv_ker_name ); + gen_func_init( &funcs[ BLIS_COPYV_KER ], copyv_ker_name ); + gen_func_init( &funcs[ BLIS_DOTV_KER ], dotv_ker_name ); + gen_func_init( &funcs[ BLIS_DOTXV_KER ], dotxv_ker_name ); + gen_func_init( &funcs[ BLIS_INVERTV_KER ], invertv_ker_name ); + gen_func_init( &funcs[ BLIS_INVSCALV_KER ], invscalv_ker_name ); + gen_func_init( &funcs[ BLIS_SCALV_KER ], scalv_ker_name ); + gen_func_init( &funcs[ BLIS_SCAL2V_KER ], scal2v_ker_name ); + gen_func_init( &funcs[ BLIS_SETV_KER ], setv_ker_name ); + gen_func_init( &funcs[ BLIS_SUBV_KER ], subv_ker_name ); + gen_func_init( &funcs[ BLIS_SWAPV_KER ], swapv_ker_name ); + gen_func_init( &funcs[ BLIS_XPBYV_KER ], xpbyv_ker_name ); // -- Set level-1m (packm/unpackm) kernels --------------------------------- diff --git a/testsuite/input.operations b/testsuite/input.operations index eebe8b605..e6c39e631 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -138,9 +138,13 @@ 1 # normfv -1 # dimensions: m +1 # invscalv +-1 # dimensions: m +? # parameters: conjalpha + 1 # scalv -1 # dimensions: m -? # parameters: conjbeta +? # parameters: conjalpha 1 # scal2v -1 # dimensions: m @@ -175,9 +179,13 @@ 1 # normfm -1 -2 # dimensions: m n +1 # invscalm +-1 -2 # dimensions: m n +? # parameters: conjalpha + 1 # scalm -1 -2 # dimensions: m n -? # parameters: conjbeta +? # parameters: conjalpha 1 # scal2m -1 -2 # dimensions: m n diff --git a/testsuite/input.operations.fast b/testsuite/input.operations.fast index b733c672d..ecd526aaa 100644 --- a/testsuite/input.operations.fast +++ b/testsuite/input.operations.fast @@ -138,9 +138,13 @@ 1 # normfv -1 # dimensions: m +1 # invscalv +-1 # dimensions: m +? # parameters: conjalpha + 1 # scalv -1 # dimensions: m -? # parameters: conjbeta +? 
# parameters: conjalpha 1 # scal2v -1 # dimensions: m @@ -175,9 +179,13 @@ 1 # normfm -1 -2 # dimensions: m n +1 # invscalm +-1 -2 # dimensions: m n +? # parameters: conjalpha + 1 # scalm -1 -2 # dimensions: m n -? # parameters: conjbeta +? # parameters: conjalpha 1 # scal2m -1 -2 # dimensions: m n diff --git a/testsuite/input.operations.mixed b/testsuite/input.operations.mixed index 6292ea8ab..eb851b786 100644 --- a/testsuite/input.operations.mixed +++ b/testsuite/input.operations.mixed @@ -138,9 +138,13 @@ 1 # normfv -1 # dimensions: m +1 # invscalv +-1 # dimensions: m +? # parameters: conjalpha + 1 # scalv -1 # dimensions: m -? # parameters: conjbeta +? # parameters: conjalpha 1 # scal2v -1 # dimensions: m @@ -175,9 +179,13 @@ 1 # normfm -1 -2 # dimensions: m n +1 # invscalm +-1 -2 # dimensions: m n +? # parameters: conjalpha + 1 # scalm -1 -2 # dimensions: m n -? # parameters: conjbeta +? # parameters: conjalpha 1 # scal2m -1 -2 # dimensions: m n diff --git a/testsuite/input.operations.salt b/testsuite/input.operations.salt index b733c672d..ecd526aaa 100644 --- a/testsuite/input.operations.salt +++ b/testsuite/input.operations.salt @@ -138,9 +138,13 @@ 1 # normfv -1 # dimensions: m +1 # invscalv +-1 # dimensions: m +? # parameters: conjalpha + 1 # scalv -1 # dimensions: m -? # parameters: conjbeta +? # parameters: conjalpha 1 # scal2v -1 # dimensions: m @@ -175,9 +179,13 @@ 1 # normfm -1 -2 # dimensions: m n +1 # invscalm +-1 -2 # dimensions: m n +? # parameters: conjalpha + 1 # scalm -1 -2 # dimensions: m n -? # parameters: conjbeta +? # parameters: conjalpha 1 # scal2m -1 -2 # dimensions: m n diff --git a/testsuite/src/test_invscalm.c b/testsuite/src/test_invscalm.c new file mode 100644 index 000000000..9ad730631 --- /dev/null +++ b/testsuite/src/test_invscalm.c @@ -0,0 +1,301 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. 
+static char* op_str = "invscalm"; +static char* o_types = "m"; // y +static char* p_types = "c"; // conjalpha +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_invscalm_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_invscalm_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_invscalm_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* y + ); + +void libblis_test_invscalm_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* y, + obj_t* y_save, + double* resid + ); + + + +void libblis_test_invscalm_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randm( tdata, params, &(op->ops->randm) ); + libblis_test_normfm( tdata, params, &(op->ops->normfm) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); +} + + + +void libblis_test_invscalm + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l1m_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_invscalm_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested.
+ //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_invscalm_experiment ); + } +} + + + +void libblis_test_invscalm_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m, n; + + conj_t conjalpha; + + obj_t alpha, y; + obj_t y_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to actual dimensions. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + + // Create test operands (vectors and/or matrices). + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &y ); + libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, + sc_str[0], m, n, &y_save ); + + // Set alpha to -2 (real case) or 0 - 2i (complex case). + //bli_setsc( 0.0, 1.0, &alpha ); + if ( bli_obj_is_real( &y ) ) + bli_setsc( -2.0, 0.0, &alpha ); + else + bli_setsc( 0.0, -2.0, &alpha ); + + // Randomize and save y. + libblis_test_mobj_randomize( params, FALSE, &y ); + bli_copym( &y, &y_save ); + + // Apply the parameters. + bli_obj_set_conj( conjalpha, &alpha ); + + // Repeat the experiment n_repeats times and record results.
+ for ( i = 0; i < n_repeats; ++i ) + { + bli_copym( &y_save, &y ); + + time = bli_clock(); + + libblis_test_invscalm_impl( iface, &alpha, &y ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 1.0 * m * n ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &y ) ) *perf *= 6.0; + + // Perform checks. + libblis_test_invscalm_check( params, &alpha, &y, &y_save, resid ); + + // Zero out performance and residual if output matrix is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + + // Free the test objects. + bli_obj_free( &y ); + bli_obj_free( &y_save ); +} + + + +void libblis_test_invscalm_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* y + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_invscalm( alpha, y ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_invscalm_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* y, + obj_t* y_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( y ); + num_t dt_real = bli_obj_dt_proj_to_real( y ); + + dim_t m = bli_obj_length( y ); + dim_t n = bli_obj_width( y ); + + obj_t norm_y_r; + + obj_t y2; + + double junk; + + // + // Pre-conditions: + // - y_orig is randomized. + // Note: + // - alpha should have a non-zero imaginary component in the complex + // cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // y := ( 1.0 / conjalpha(alpha) ) * y_orig + // + // is functioning correctly if + // + // normfm( y_orig - conjalpha(alpha) * y ) + // + // is negligible.
+ // + + bli_obj_create( dt, m, n, 0, 0, &y2 ); + bli_copym( y, &y2 ); + + bli_obj_scalar_init_detached( dt_real, &norm_y_r ); + + bli_scalm( alpha, &y2 ); + bli_subm( y_orig, &y2 ); + + bli_normfm( &y2, &norm_y_r ); + + bli_getsc( &norm_y_r, resid, &junk ); + + bli_obj_free( &y2 ); +} + diff --git a/testsuite/src/test_invscalm.h b/testsuite/src/test_invscalm.h new file mode 100644 index 000000000..698f9b377 --- /dev/null +++ b/testsuite/src/test_invscalm.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_invscalm + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_invscalv.c b/testsuite/src/test_invscalv.c new file mode 100644 index 000000000..47d46b4c2 --- /dev/null +++ b/testsuite/src/test_invscalv.c @@ -0,0 +1,297 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "invscalv"; +static char* o_types = "v"; // y +static char* p_types = "c"; // conjalpha +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. +void libblis_test_invscalv_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + +void libblis_test_invscalv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_invscalv_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* y + ); + +void libblis_test_invscalv_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* y, + obj_t* y_orig, + double* resid + ); + + + +void libblis_test_invscalv_deps + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( tdata, params, &(op->ops->randv) ); + libblis_test_normfv( tdata, params, &(op->ops->normfv) ); + libblis_test_addv( tdata, params, &(op->ops->addv) ); + libblis_test_copyv( tdata, params, &(op->ops->copyv) ); +} + + + +void libblis_test_invscalv + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been 
done. + if ( libblis_test_op_is_done( op ) ) return; + + // Return early if operation is disabled. + if ( libblis_test_op_is_disabled( op ) || + libblis_test_l1v_is_disabled( op ) ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_invscalv_deps( tdata, params, op ); + + // Execute the test driver for each implementation requested. + //if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( tdata, + params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_invscalv_experiment ); + } +} + + + +void libblis_test_invscalv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + char* dc_str, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + num_t datatype; + + dim_t m; + + conj_t conjalpha; + + obj_t alpha, y; + obj_t y_save; + + + // Use the datatype of the first char in the datatype combination string. + bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha ); + + // Create test scalars. + bli_obj_scalar_init_detached( datatype, &alpha ); + + // Create test operands (vectors and/or matrices). + libblis_test_vobj_create( params, datatype, sc_str[0], m, &y ); + libblis_test_vobj_create( params, datatype, sc_str[0], m, &y_save ); + + // Set alpha. + if ( bli_obj_is_real( &y ) ) + bli_setsc( -2.0, 0.0, &alpha ); + else + bli_setsc( 0.0, -2.0, &alpha ); + + // Randomize and save y. + libblis_test_vobj_randomize( params, FALSE, &y ); + bli_copyv( &y, &y_save ); + + // Apply the parameters. + bli_obj_set_conj( conjalpha, &alpha ); + + // Repeat the experiment n_repeats times and record results. 
+ for ( i = 0; i < n_repeats; ++i ) + { + bli_copyv( &y_save, &y ); + + time = bli_clock(); + + libblis_test_invscalv_impl( iface, &alpha, &y ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( &y ) ) *perf *= 6.0; + + // Perform checks. + libblis_test_invscalv_check( params, &alpha, &y, &y_save, resid ); + + // Zero out performance and residual if output vector is empty. + libblis_test_check_empty_problem( &y, perf, resid ); + + // Free the test objects. + bli_obj_free( &y ); + bli_obj_free( &y_save ); +} + + + +void libblis_test_invscalv_impl + ( + iface_t iface, + obj_t* alpha, + obj_t* y + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_invscalv( alpha, y ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_invscalv_check + ( + test_params_t* params, + obj_t* alpha, + obj_t* y, + obj_t* y_orig, + double* resid + ) +{ + num_t dt = bli_obj_dt( y ); + num_t dt_real = bli_obj_dt_proj_to_real( y ); + + dim_t m = bli_obj_vector_dim( y ); + + obj_t norm_y_r; + + obj_t y2; + + double junk; + + // + // Pre-conditions: + // - y_orig is randomized. + // Note: + // - alpha should have a non-zero imaginary component in the complex + // cases in order to more fully exercise the implementation. + // + // Under these conditions, we assume that the implementation for + // + // y := ( 1.0 / conjalpha(alpha) ) * y_orig + // + // is functioning correctly if + // + // normfv( y_orig - conjalpha(alpha) * y ) + // + // is negligible. 
+ // + + bli_obj_create( dt, m, 1, 0, 0, &y2 ); + bli_copyv( y, &y2 ); + + bli_obj_scalar_init_detached( dt_real, &norm_y_r ); + + bli_scalv( alpha, &y2 ); + bli_subv( y_orig, &y2 ); + + bli_normfv( &y2, &norm_y_r ); + + bli_getsc( &norm_y_r, resid, &junk ); + + bli_obj_free( &y2 ); +} + diff --git a/testsuite/src/test_invscalv.h b/testsuite/src/test_invscalv.h new file mode 100644 index 000000000..297be4836 --- /dev/null +++ b/testsuite/src/test_invscalv.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void libblis_test_invscalv + ( + thread_data_t* tdata, + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 442fae0e0..3ce92e377 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -255,6 +255,7 @@ void libblis_test_level1v_ops( thread_data_t* tdata, test_params_t* params, test libblis_test_dotv( tdata, params, &(ops->dotv) ); libblis_test_dotxv( tdata, params, &(ops->dotxv) ); libblis_test_normfv( tdata, params, &(ops->normfv) ); + libblis_test_invscalv( tdata, params, &(ops->invscalv) ); libblis_test_scalv( tdata, params, &(ops->scalv) ); libblis_test_scal2v( tdata, params, &(ops->scal2v) ); libblis_test_setv( tdata, params, &(ops->setv) ); @@ -270,6 +271,7 @@ void libblis_test_level1m_ops( thread_data_t* tdata, test_params_t* params, test libblis_test_axpym( tdata, params, &(ops->axpym) ); libblis_test_copym( tdata, params, &(ops->copym) ); libblis_test_normfm( tdata, params, &(ops->normfm) ); + libblis_test_invscalm( tdata, params, &(ops->invscalm) ); libblis_test_scalm( tdata, params, &(ops->scalm) ); libblis_test_scal2m( tdata, params, &(ops->scal2m) ); libblis_test_setm( tdata, params, &(ops->setm) ); @@ -370,6 +372,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 2, &(ops->dotv) ); libblis_test_read_op_info( ops, input_stream, 
BLIS_NOID, BLIS_TEST_DIMS_M, 2, &(ops->dotxv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 0, &(ops->normfv) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->invscalv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->scalv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->scal2v) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 0, &(ops->setv) ); @@ -381,6 +384,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->axpym) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->copym) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 0, &(ops->normfm) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->invscalm) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->scalm) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 1, &(ops->scal2m) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_MN, 0, &(ops->setm) ); @@ -2705,8 +2709,9 @@ void libblis_test_vobj_randomize( test_params_t* params, bool normalize, obj_t* bli_normfv( x, &kappa_r ); libblis_test_ceil_pow2( &kappa_r ); bli_copysc( &kappa_r, &kappa ); - bli_invertsc( &kappa ); - bli_scalv( &kappa, x ); + //bli_invertsc( &kappa ); + //bli_scalv( &kappa, x ); + bli_invscalv( &kappa, x ); } } @@ -2744,8 +2749,9 @@ void libblis_test_mobj_randomize( test_params_t* params, bool normalize, obj_t* bli_norm1m( a, &kappa_r ); libblis_test_ceil_pow2( &kappa_r ); bli_copysc( &kappa_r, &kappa ); - bli_invertsc( &kappa ); - bli_scalm( &kappa, a ); + //bli_invertsc( &kappa ); + //bli_scalm( &kappa, a ); + bli_invscalm( &kappa, a ); } } diff --git 
a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index cdb3c6dac..9e38964ee 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -230,6 +230,7 @@ typedef struct test_ops_s test_op_t dotv; test_op_t dotxv; test_op_t normfv; + test_op_t invscalv; test_op_t scalv; test_op_t scal2v; test_op_t setv; @@ -241,6 +242,7 @@ typedef struct test_ops_s test_op_t axpym; test_op_t copym; test_op_t normfm; + test_op_t invscalm; test_op_t scalm; test_op_t scal2m; test_op_t setm; @@ -504,6 +506,7 @@ char libblis_test_proj_dtchar_to_precchar( char dt_char ); #include "test_dotv.h" #include "test_dotxv.h" #include "test_normfv.h" +#include "test_invscalv.h" #include "test_scalv.h" #include "test_scal2v.h" #include "test_setv.h" @@ -515,6 +518,7 @@ char libblis_test_proj_dtchar_to_precchar( char dt_char ); #include "test_axpym.h" #include "test_copym.h" #include "test_normfm.h" +#include "test_invscalm.h" #include "test_scalm.h" #include "test_scal2m.h" #include "test_setm.h" diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c index 6219c71df..bd4565ccd 100644 --- a/testsuite/src/test_scalm.c +++ b/testsuite/src/test_scalm.c @@ -40,7 +40,7 @@ // Static variables. 
static char* op_str = "scalm"; static char* o_types = "m"; // x -static char* p_types = "c"; // conjbeta +static char* p_types = "c"; // conjalpha static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s { 1e-04, 1e-05 }, // warn, pass for c { 1e-13, 1e-14 }, // warn, pass for d @@ -70,14 +70,14 @@ void libblis_test_scalm_experiment void libblis_test_scalm_impl ( iface_t iface, - obj_t* beta, + obj_t* alpha, obj_t* y ); void libblis_test_scalm_check ( test_params_t* params, - obj_t* beta, + obj_t* alpha, obj_t* y, obj_t* y_save, double* resid @@ -157,9 +157,9 @@ void libblis_test_scalm_experiment dim_t m, n; - conj_t conjbeta; + conj_t conjalpha; - obj_t beta, y; + obj_t alpha, y; obj_t y_save; @@ -171,10 +171,10 @@ void libblis_test_scalm_experiment n = libblis_test_get_dim_from_prob_size( op->dim_spec[1], p_cur ); // Map parameter characters to BLIS constants. - bli_param_map_char_to_blis_conj( pc_str[0], &conjbeta ); + bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha ); // Create test scalars. - bli_obj_scalar_init_detached( datatype, &beta ); + bli_obj_scalar_init_detached( datatype, &alpha ); // Create test operands (vectors and/or matrices). libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, @@ -182,19 +182,19 @@ void libblis_test_scalm_experiment libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE, sc_str[0], m, n, &y_save ); - // Set beta to 0 + i. - //bli_setsc( 0.0, 1.0, &beta ); + // Set alpha to 0 + i. + //bli_setsc( 0.0, 1.0, &alpha ); if ( bli_obj_is_real( &y ) ) - bli_setsc( -2.0, 0.0, &beta ); + bli_setsc( -2.0, 0.0, &alpha ); else - bli_setsc( 0.0, -2.0, &beta ); + bli_setsc( 0.0, -2.0, &alpha ); // Randomize and save y. libblis_test_mobj_randomize( params, FALSE, &y ); bli_copym( &y, &y_save ); // Apply the parameters. - bli_obj_set_conj( conjbeta, &beta ); + bli_obj_set_conj( conjalpha, &alpha ); // Repeat the experiment n_repeats times and record results. 
for ( i = 0; i < n_repeats; ++i ) @@ -203,7 +203,7 @@ void libblis_test_scalm_experiment time = bli_clock(); - libblis_test_scalm_impl( iface, &beta, &y ); + libblis_test_scalm_impl( iface, &alpha, &y ); time_min = bli_clock_min_diff( time_min, time ); } @@ -213,7 +213,7 @@ void libblis_test_scalm_experiment if ( bli_obj_is_complex( &y ) ) *perf *= 6.0; // Perform checks. - libblis_test_scalm_check( params, &beta, &y, &y_save, resid ); + libblis_test_scalm_check( params, &alpha, &y, &y_save, resid ); // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &y, perf, resid ); @@ -228,14 +228,14 @@ void libblis_test_scalm_experiment void libblis_test_scalm_impl ( iface_t iface, - obj_t* beta, + obj_t* alpha, obj_t* y ) { switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_scalm( beta, y ); + bli_scalm( alpha, y ); break; default: @@ -248,7 +248,7 @@ void libblis_test_scalm_impl void libblis_test_scalm_check ( test_params_t* params, - obj_t* beta, + obj_t* alpha, obj_t* y, obj_t* y_orig, double* resid @@ -261,7 +261,7 @@ void libblis_test_scalm_check dim_t n = bli_obj_width( y ); obj_t norm_y_r; - obj_t nbeta; + obj_t nalpha; obj_t y2; @@ -271,16 +271,16 @@ void libblis_test_scalm_check // Pre-conditions: // - y_orig is randomized. // Note: - // - beta should have a non-zero imaginary component in the complex + // - alpha should have a non-zero imaginary component in the complex // cases in order to more fully exercise the implementation. // // Under these conditions, we assume that the implementation for // - // y := conjbeta(beta) * y_orig + // y := conjalpha(alpha) * y_orig // // is functioning correctly if // - // normfm( y + -conjbeta(beta) * y_orig ) + // normfm( y + -conjalpha(alpha) * y_orig ) // // is negligible. 
// @@ -288,13 +288,13 @@ void libblis_test_scalm_check bli_obj_create( dt, m, n, 0, 0, &y2 ); bli_copym( y_orig, &y2 ); - bli_obj_scalar_init_detached( dt, &nbeta ); + bli_obj_scalar_init_detached( dt, &nalpha ); bli_obj_scalar_init_detached( dt_real, &norm_y_r ); - bli_copysc( beta, &nbeta ); - bli_mulsc( &BLIS_MINUS_ONE, &nbeta ); + bli_copysc( alpha, &nalpha ); + bli_mulsc( &BLIS_MINUS_ONE, &nalpha ); - bli_scalm( &nbeta, &y2 ); + bli_scalm( &nalpha, &y2 ); bli_addm( &y2, y ); bli_normfm( y, &norm_y_r ); diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c index 142b5e410..7b409103b 100644 --- a/testsuite/src/test_scalv.c +++ b/testsuite/src/test_scalv.c @@ -40,7 +40,7 @@ // Static variables. static char* op_str = "scalv"; static char* o_types = "v"; // y -static char* p_types = "c"; // conjbeta +static char* p_types = "c"; // conjalpha static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s { 1e-04, 1e-05 }, // warn, pass for c { 1e-13, 1e-14 }, // warn, pass for d @@ -70,14 +70,14 @@ void libblis_test_scalv_experiment void libblis_test_scalv_impl ( iface_t iface, - obj_t* beta, + obj_t* alpha, obj_t* y ); void libblis_test_scalv_check ( test_params_t* params, - obj_t* beta, + obj_t* alpha, obj_t* y, obj_t* y_orig, double* resid @@ -158,9 +158,9 @@ void libblis_test_scalv_experiment dim_t m; - conj_t conjbeta; + conj_t conjalpha; - obj_t beta, y; + obj_t alpha, y; obj_t y_save; @@ -171,27 +171,27 @@ void libblis_test_scalv_experiment m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); // Map parameter characters to BLIS constants. - bli_param_map_char_to_blis_conj( pc_str[0], &conjbeta ); + bli_param_map_char_to_blis_conj( pc_str[0], &conjalpha ); // Create test scalars. - bli_obj_scalar_init_detached( datatype, &beta ); + bli_obj_scalar_init_detached( datatype, &alpha ); // Create test operands (vectors and/or matrices). 
libblis_test_vobj_create( params, datatype, sc_str[0], m, &y ); libblis_test_vobj_create( params, datatype, sc_str[0], m, &y_save ); - // Set beta. + // Set alpha. if ( bli_obj_is_real( &y ) ) - bli_setsc( -2.0, 0.0, &beta ); + bli_setsc( -2.0, 0.0, &alpha ); else - bli_setsc( 0.0, -2.0, &beta ); + bli_setsc( 0.0, -2.0, &alpha ); // Randomize and save y. libblis_test_vobj_randomize( params, FALSE, &y ); bli_copyv( &y, &y_save ); // Apply the parameters. - bli_obj_set_conj( conjbeta, &beta ); + bli_obj_set_conj( conjalpha, &alpha ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -200,7 +200,7 @@ void libblis_test_scalv_experiment time = bli_clock(); - libblis_test_scalv_impl( iface, &beta, &y ); + libblis_test_scalv_impl( iface, &alpha, &y ); time_min = bli_clock_min_diff( time_min, time ); } @@ -210,7 +210,7 @@ void libblis_test_scalv_experiment if ( bli_obj_is_complex( &y ) ) *perf *= 6.0; // Perform checks. - libblis_test_scalv_check( params, &beta, &y, &y_save, resid ); + libblis_test_scalv_check( params, &alpha, &y, &y_save, resid ); // Zero out performance and residual if output vector is empty. libblis_test_check_empty_problem( &y, perf, resid ); @@ -225,14 +225,14 @@ void libblis_test_scalv_experiment void libblis_test_scalv_impl ( iface_t iface, - obj_t* beta, + obj_t* alpha, obj_t* y ) { switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_scalv( beta, y ); + bli_scalv( alpha, y ); break; default: @@ -245,7 +245,7 @@ void libblis_test_scalv_impl void libblis_test_scalv_check ( test_params_t* params, - obj_t* beta, + obj_t* alpha, obj_t* y, obj_t* y_orig, double* resid @@ -257,7 +257,7 @@ void libblis_test_scalv_check dim_t m = bli_obj_vector_dim( y ); obj_t norm_y_r; - obj_t nbeta; + obj_t nalpha; obj_t y2; @@ -267,16 +267,16 @@ void libblis_test_scalv_check // Pre-conditions: // - y_orig is randomized. 
// Note: - // - beta should have a non-zero imaginary component in the complex + // - alpha should have a non-zero imaginary component in the complex // cases in order to more fully exercise the implementation. // // Under these conditions, we assume that the implementation for // - // y := conjbeta(beta) * y_orig + // y := conjalpha(alpha) * y_orig // // is functioning correctly if // - // normfv( y + -conjbeta(beta) * y_orig ) + // normfv( y + -conjalpha(alpha) * y_orig ) // // is negligible. // @@ -284,13 +284,13 @@ void libblis_test_scalv_check bli_obj_create( dt, m, 1, 0, 0, &y2 ); bli_copyv( y_orig, &y2 ); - bli_obj_scalar_init_detached( dt, &nbeta ); + bli_obj_scalar_init_detached( dt, &nalpha ); bli_obj_scalar_init_detached( dt_real, &norm_y_r ); - bli_copysc( beta, &nbeta ); - bli_mulsc( &BLIS_MINUS_ONE, &nbeta ); + bli_copysc( alpha, &nalpha ); + bli_mulsc( &BLIS_MINUS_ONE, &nalpha ); - bli_scalv( &nbeta, &y2 ); + bli_scalv( &nalpha, &y2 ); bli_addv( &y2, y ); bli_normfv( y, &norm_y_r ); From 6e5431e8494b06bd80efcab3abf0a6456d6c0381 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 10 Sep 2022 15:16:58 -0500 Subject: [PATCH 078/230] Fix line number issue in flattened blis.h. (#660) Details: - Updated the top-level Makefile so that it invokes flatten-headers.py without the -c option, which was requesting that comments be stripped (since comment stripping is disabled by default). - Updated flatten-headers.py to accept a new option (-l) to enable insertion of #line directives into the output file. This new option is enabled by default. - Also added logic to flatten-headers.py that outputs a warning if both comment stripping and line numbers are requested since the comment stripping will cause the line numbers to become inaccurate. 
--- Makefile | 8 ++++---- build/flatten-headers.py | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 5605dd8fc..e790e8752 100644 --- a/Makefile +++ b/Makefile @@ -492,10 +492,10 @@ flat-header: check-env $(BLIS_H_FLAT) $(BLIS_H_FLAT): $(ALL_H99_FILES) ifeq ($(ENABLE_VERBOSE),yes) - $(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + $(FLATTEN_H) -l -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" else @echo -n "Generating monolithic blis.h" - @$(FLATTEN_H) -c -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + @$(FLATTEN_H) -l -v1 $(BLIS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" @echo "Generated $@" endif @@ -505,10 +505,10 @@ flat-cblas-header: check-env $(CBLAS_H_FLAT) $(CBLAS_H_FLAT): $(FRAME_H99_FILES) ifeq ($(ENABLE_VERBOSE),yes) - $(FLATTEN_H) -c -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + $(FLATTEN_H) -l -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" else @echo -n "Generating monolithic cblas.h" - @$(FLATTEN_H) -c -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" + @$(FLATTEN_H) -l -v1 $(CBLAS_H_SRC_PATH) $@ "./$(INCLUDE_DIR)" "$(ALL_H99_DIRPATHS)" @echo "Generated $@" endif diff --git a/build/flatten-headers.py b/build/flatten-headers.py index ecd4635d1..2d5b74c7a 100755 --- a/build/flatten-headers.py +++ b/build/flatten-headers.py @@ -278,14 +278,16 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Mark the beginning of the header being inserted. ostring += "%s%s%c" % ( beginstr, header, '\n' ) - #ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) + if line_numbers: + ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) # Recurse on the header, accumulating the string. ostring += flatten_header( header_path, header_dirpaths, cursp + " " ) # Mark the end of the header being inserted. 
ostring += "%s%s%c" % ( endstr, header, '\n' ) - #ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) + if line_numbers: + ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) echov2( "%sheader file '%s' fully processed." \ % ( cursp, header_path ) ) @@ -350,6 +352,7 @@ def find_header_dirs( dirpath ): output_name = None strip_comments = None recursive_flag = None +line_numbers = None verbose_flag = None regex = None root_inputfile = None @@ -360,6 +363,7 @@ def main(): global output_name global strip_comments global recursive_flag + global line_numbers global verbose_flag global regex global root_inputfile @@ -371,13 +375,14 @@ def main(): strip_comments = False recursive_flag = False + line_numbers = False verbose_flag = "1" nestsp = " " # Process our command line options. try: - opts, args = getopt.getopt( sys.argv[1:], "o:rchv:" ) + opts, args = getopt.getopt( sys.argv[1:], "o:rclhv:" ) except getopt.GetoptError as err: # print help information and exit: @@ -390,6 +395,8 @@ def main(): output_name = optarg elif opt == "-r": recursive_flag = True + elif opt == "-l": + line_numbers = True elif opt == "-c": strip_comments = True elif opt == "-v": @@ -401,6 +408,9 @@ def main(): print_usage() sys.exit() + if line_numbers and strip_comments: + my_print( "WARNING: stripping comments will result in inaccurate line numbers" ) + # Make sure that the verboseness level is valid. if ( verbose_flag != "0" and verbose_flag != "1" and From cb74202db39dc8cb81fdd06f8a445f8837e27853 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 13 Sep 2022 11:46:24 -0500 Subject: [PATCH 079/230] Fixed incorrect sizeof(type) in edge case macros. 
(#662) Details: - In bli_edge_case_macro_defs.h, the GEMM_UKR_SETUP_CT_PRE() and GEMMTRSM_UKR_SETUP_CT_PRE() macros previously declared their temporary ct microtiles as: PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ The problem here is that sizeof( PASTEMAC(ch,type) ) evaluates to things like sizeof( BLIS_DOUBLE ), not sizeof( double ), and since BLIS_DOUBLE is an enum, it is typically an int, which means the sizeof() expression is evaluating to the wrong value. This was likely a benign bug, though, since BLIS does not support any computational datatypes that are smaller than sizeof( int ), which means the ct array would be *over*-allocated rather than underallocated. Thanks to @moon-chilled for identifying and reporting this bug in #624. - CREDITS file update. --- CREDITS | 1 + frame/include/bli_edge_case_macro_defs.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 49361c801..152de0a4b 100644 --- a/CREDITS +++ b/CREDITS @@ -68,6 +68,7 @@ but many others have contributed code and feedback, including Devin Matthews @devinamatthews (The University of Texas at Austin) Stefanos Mavros @smavros Mithun Mohan @MithunMohanKadavil (AMD) + @moon-chilled Ilknur Mustafazade @Runkli @nagsingh Bhaskar Nallani @BhaskarNallani (AMD) diff --git a/frame/include/bli_edge_case_macro_defs.h b/frame/include/bli_edge_case_macro_defs.h index 70d97d5d1..6fc4e46c8 100644 --- a/frame/include/bli_edge_case_macro_defs.h +++ b/frame/include/bli_edge_case_macro_defs.h @@ -47,7 +47,7 @@ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ - PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ + PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 
1 : mr; @@ -137,7 +137,7 @@ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ - PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ + PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; From fd885cf98f4fe1d3bc46468e567776c37c670fcc Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 13 Sep 2022 11:50:23 -0500 Subject: [PATCH 080/230] Use kernel CFLAGS for 'kernels' subdirs in addons. (#658) Details: - Updated Makefile and common.mk so that the targeted configuration's kernel CFLAGS are applied to source files that are found in a 'kernels' subdirectory within an enabled addon. For now, this behavior only applies when the 'kernels' directory is at the top level of the addon directory structure. For example, if there is an addon named 'foobar', the source code must be located in addon/foobar/kernels/ in order for it to be compiled with the target configuration's kernel CFLAGS. Any other source code within addon/foobar/ will be compiled with general-purpose CFLAGS (the same ones that were used on all addon code prior to this commit). Thanks to AMD (esp. Mithun Mohan) for suggesting this change and catching an intermediate bug in the PR. - Comment/whitespace updates. --- Makefile | 43 +++++++++++++++++++++++++++++++++++++++++-- common.mk | 32 +++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index e790e8752..f5396f79b 100644 --- a/Makefile +++ b/Makefile @@ -213,8 +213,19 @@ MK_REFKERN_OBJS := $(foreach arch, $(CONFIG_LIST), \ MK_FRAME_OBJS := $(call gen-obj-paths-from-src,$(FRAME_SRC_SUFS),$(MK_FRAME_SRC),$(FRAME_PATH),$(BASE_OBJ_FRAME_PATH)) # Generate object file paths for the addon source code.
If one or more addons -# were not enabled a configure-time, this variable will we empty. -MK_ADDON_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH)) +# were not enabled at configure-time, these variables will be empty. +# NOTE: We separate the source and objects into kernel and non-kernel lists. +MK_ADDON_KERS_SRC := $(foreach addon, $(ADDON_LIST), \ + $(filter $(ADDON_PATH)/$(addon)/$(KERNELS_DIR)/%, \ + $(MK_ADDON_SRC)) \ + ) +MK_ADDON_OTHER_SRC := $(foreach addon, $(ADDON_LIST), \ + $(filter-out $(ADDON_PATH)/$(addon)/$(KERNELS_DIR)/%, \ + $(MK_ADDON_SRC)) \ + ) +MK_ADDON_KERS_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_KERS_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH)) +MK_ADDON_OTHER_OBJS := $(call gen-obj-paths-from-src,$(ADDON_SRC_SUFS),$(MK_ADDON_OTHER_SRC),$(ADDON_PATH),$(BASE_OBJ_ADDON_PATH)) +MK_ADDON_OBJS := $(MK_ADDON_KERS_OBJS) $(MK_ADDON_OTHER_OBJS) # Generate object file paths for the sandbox source code. If a sandbox was not # enabled a configure-time, this variable will we empty. @@ -580,6 +591,7 @@ endef # first argument: a configuration name from the union of config_list and # config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C99 addon file suffix being considered. define make-c99-addon-rule $(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) @@ -590,6 +602,23 @@ else endif endef +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C99 addon file suffix being considered. +# third argument: the name of the addon being considered.
+define make-c99-addon-kers-rule +$(BASE_OBJ_ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.o: $(ADDON_PATH)/$(3)/$(KERNELS_DIR)/%.$(2) $(BLIS_H_FLAT) $(ADDON_H99_FILES) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CC) $(call get-addon-kernel-c99flags-for,$(1)) -c $$< -o $$@ +else + @echo "Compiling $$@" $(call get-addon-kernel-text-for,$(1)) + @$(CC) $(call get-addon-kernel-c99flags-for,$(1)) -c $$< -o $$@ +endif +endef + +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C++ addon file suffix being considered. define make-cxx-addon-rule $(BASE_OBJ_ADDON_PATH)/%.o: $(ADDON_PATH)/%.$(2) $(BLIS_H_FLAT) $(ADDON_HXX_FILES) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) @@ -602,6 +631,7 @@ endef # first argument: a configuration name from the union of config_list and # config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C99 sandbox file suffix being considered. define make-c99-sandbox-rule $(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_H99_FILES) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) @@ -612,6 +642,9 @@ else endif endef +# first argument: a configuration name from the union of config_list and +# config_name, used to look up the CFLAGS to use during compilation. +# second argument: the C++ sandbox file suffix being considered. define make-cxx-sandbox-rule $(BASE_OBJ_SANDBOX_PATH)/%.o: $(SANDBOX_PATH)/%.$(2) $(BLIS_H_FLAT) $(SANDBOX_HXX_FILES) $(MAKE_DEFS_MK_PATHS) ifeq ($(ENABLE_VERBOSE),yes) @@ -657,6 +690,12 @@ $(foreach kset, $(KERNEL_LIST), $(eval $(call make-kernels-rule,$(kset),$(call g $(foreach suf, $(ADDON_C99_SUFS), \ $(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-rule,$(conf),$(suf))))) +# Instantiate the build rule for C addon/kernels files. Use the CFLAGS for the +# configuration family. 
+$(foreach addon, $(ADDON_LIST), \ +$(foreach suf, $(ADDON_C99_SUFS), \ +$(foreach conf, $(CONFIG_NAME), $(eval $(call make-c99-addon-kers-rule,$(conf),$(suf),$(addon)))))) + # Instantiate the build rule for C++ addon files. Use the CFLAGS for the # configuration family. $(foreach suf, $(ADDON_CXX_SUFS), \ diff --git a/common.mk b/common.mk index 33713e9f5..b49089419 100644 --- a/common.mk +++ b/common.mk @@ -154,7 +154,7 @@ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ $(BUILD_SYMFLAGS) \ ) -# When compiling sandboxes, we use flags similar to those of general framework +# When compiling addons, we use flags similar to those of general framework # source. This ensures that the same code can be linked and run across various # sub-configurations. get-addon-c99flags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ @@ -169,6 +169,15 @@ get-addon-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) +# When compiling addon kernels, we use flags similar to those of kernels +# flags, except we also include the addon header paths. +get-addon-kernel-c99flags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ + $(call load-var-for,CKVECFLAGS,$(1)) \ + $(call get-noopt-cflags-for,$(1)) \ + $(CADDONINCFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) # When compiling sandboxes, we use flags similar to those of general framework # source. This ensures that the same code can be linked and run across various @@ -203,16 +212,17 @@ get-user-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ # Define functions that return messages appropriate for each non-verbose line # of compilation output. -get-noopt-text = "(CFLAGS for no optimization)" -get-refinit-text-for = "('$(1)' CFLAGS for ref. kernel init)" -get-refkern-text-for = "('$(1)' CFLAGS for ref. 
kernels)" -get-config-text-for = "('$(1)' CFLAGS for config code)" -get-frame-text-for = "('$(1)' CFLAGS for framework code)" -get-kernel-text-for = "('$(1)' CFLAGS for kernels)" -get-addon-c99text-for = "('$(1)' CFLAGS for addons)" -get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)" -get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)" -get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" +get-noopt-text = "(CFLAGS for no optimization)" +get-refinit-text-for = "('$(1)' CFLAGS for ref. kernel init)" +get-refkern-text-for = "('$(1)' CFLAGS for ref. kernels)" +get-config-text-for = "('$(1)' CFLAGS for config code)" +get-frame-text-for = "('$(1)' CFLAGS for framework code)" +get-kernel-text-for = "('$(1)' CFLAGS for kernels)" +get-addon-c99text-for = "('$(1)' CFLAGS for addons)" +get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)" +get-addon-kernel-text-for = "('$(1)' CFLAGS for addon kernels)" +get-sandbox-c99text-for = "('$(1)' CFLAGS for sandboxes)" +get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" From 05a811e898b371a76581abd4afa416980cce7db9 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 13 Sep 2022 19:24:05 -0500 Subject: [PATCH 081/230] Initialize rntm_t nt/ways fields with 1 (not -1). (#663) Details: - Changed the way that rntm_t structs are initialized, mainly so that the global rntm_t that is set via environment variables at runtime may be queried by the application prior to any computation taking place. (Strictly speaking, the application may already query these fields, but they do not always contain valid values and often contain -1 when they are unset.) These changes also served to clarify how these parameters are treated, and homogenized the implementations of bli_rntm_set_ways_from_rntm(), bli_rntm_set_ways_from_rntm_sup(), and bli_thread_init_rntm_from_env(). 
Special thanks to Jeff Diamond, Leick Robinson, and Devin Matthews for pointing out that the previous behavior was needlessly confusing and could be improved. - The aforementioned modifications also included subtle changes as to what counts as "setting" a loop's ways of parallelism for the purposes of deciding whether to use the ways or the total number of threads. Previously, setting any loop's ways, even to 1, counted in favor of using the ways. Now, only values greater than 1 will count as "setting", and all other values will silently be mapped to 1, with those parameters treated as if they were untouched all along. - Updated bli_rntm.h and bli_thread.c so that any attempt to set the PC_NT variable (or pc_nt field of a rntm_t) will either ignore the request or reassert the value as 1. - Updated bli_rntm_set_ways() so that rather than clear the num_threads field, it is set to the product of all of the per-loop ways of parallelism. - Removed code from test_libblis.c that handled the possibility of unset environment variables when printing out their values. - Removed bli_rntm_equals() inline function from bli_rntm.h, which has long been disabled. - Updates to docs/Multithreading.md related to the aforementioned changes. - Comment updates. 
--- docs/Multithreading.md | 59 +++++---- frame/base/bli_rntm.c | 249 ++++++++++++++++++----------------- frame/base/bli_rntm.h | 38 ++---- frame/thread/bli_thread.c | 97 ++++++++++---- testsuite/src/test_libblis.c | 21 +-- 5 files changed, 246 insertions(+), 218 deletions(-) diff --git a/docs/Multithreading.md b/docs/Multithreading.md index 48fbc8ca1..8e636f06a 100644 --- a/docs/Multithreading.md +++ b/docs/Multithreading.md @@ -47,6 +47,7 @@ $ ./configure --enable-threading=pthreads auto ``` You can also use the shorthand option for `--enable-threading`, which is `-t`: ``` +$ ./configure -t openmp auto $ ./configure -t pthreads auto ``` For more complete and up-to-date information on the `--enable-threading` option, simply run `configure` with the `--help` (or `-h`) option: @@ -102,19 +103,19 @@ There are three broad methods of specifying multithreading in BLIS: * [Globally at runtime](Multithreading.md#globally-at-runtime) * [Locally at runtime](Multithreading.md#locally-at-runtime) (that is, on a per-call, thread-safe basis) -Within these three broad methods there are two specific ways of expressing a request for parallelism. First, the user may express a single number--the total number of threads, or ways of parallelism, to use within a single operation such as `gemm`. We call this the "automatic" way. Alternatively, the user may express the number of ways of parallelism to obtain within *each loop* of the level-3 operation. We call this the "manual" way. The latter way is actually what BLIS eventually needs before it can perform its multithreading; the former is viable only because we have a heuristic of determining a reasonable instance of the latter when given the former. -This pattern--automatic or manual--holds regardless of which of the three methods is used. +Within each of these three broad methods there are two specific ways of expressing a request for parallelism. 
First, the user may express a single number--the total number of threads, or ways of parallelism, to use within a single operation such as `gemm`. We call this the "automatic" way. Alternatively, the user may express the number of ways of parallelism to obtain within *each loop* of the level-3 operation. We call this the "manual" way. The latter way is actually what BLIS eventually needs before it can perform its multithreading; the former is viable only because we have a heuristic of determining a reasonable instance of the latter when given the former. +This choice--automatic or manual--must be made regardless of which of the three methods is used. Regardless of which method is employed, and which specific way within each method, after setting the number of threads, the application may call the desired level-3 operation (via either the [typed API](docs/BLISTypedAPI.md) or the [object API](docs/BLISObjectAPI.md)) and the operation will execute in a multithreaded manner. (When calling BLIS via the BLAS API, only the first two (global) methods are available.) **Note**: Please be aware of what happens if you try to specify both the automatic and manual ways, as it could otherwise confuse new users. Here are the important points: - * Regardless of which broad method is used, **if multithreading is specified via both the automatic and manual ways, the values set via the manual way will always take precedence.** - * Specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1). And in the case of the environment variable method, setting the ways of parallelism for a loop to 1 counts as specifying parallelism! If you want to switch from using the manual way to automatic way, you must not only set (`export`) the `BLIS_NUM_THREADS` variable, but you must also `unset` all of the `BLIS_*_NT` variables. 
- * If you have specified multithreading via *both* the automatic and manual ways, BLIS will **not** complain if the values are inconsistent with one another. (For example, you may request 12 total threads be used while also specifying 2 and 4 ways of parallelism within the JC and IC loops, respectively, for a total of 8 ways.) Furthermore, you will be able to query these inconsistent values via the runtime API both before and after multithreading executes. + * Regardless of which of the three methods is used, **if multithreading is specified via both the automatic and manual ways, the values set via the manual way will always take precedence.** + * Specifying parallelism for even *one* loop counts as specifying the manual way (in which case the ways of parallelism for the remaining loops will be assumed to be 1). (Note: Setting the ways of parallelism for a loop to any value less than or equal to 1 does *not* count as specifying parallelism for that loop; in these cases, the default of 1 will silently be used instead.) If you want to switch from using the manual way to automatic way, you must not only set (`export`) the `BLIS_NUM_THREADS` variable, but you must either `unset` all of the `BLIS_*_NT` variables, or make sure they are all set to 1. + * If you have specified multithreading via *both* the automatic and manual ways, BLIS will **not** complain if the values are inconsistent with one another. (For example, you may request 12 total threads be used while also specifying 2 and 4 ways of parallelism within the JC and IC loops, respectively, for a total of 8 ways. 12 is obviously not equal to 8, and in this case the 8-thread specification will prevail.) Furthermore, you will be able to query these inconsistent values via the runtime API both before and after multithreading executes. * If multithreading is disabled, you **may still** specify multithreading values via either the manual or automatic ways. However, BLIS will silently ignore **all** of these values. 
A BLIS library that is built with multithreading disabled at configure-time will always run sequentially (from the perspective of a single application thread). Furthermore: -* For small numbers of threads, the number requested will be honored faithfully. However, if you request a larger number of threads that happens to also be prime, BLIS will reduce the number by one in order to allow more more efficient thread factorizations. This behavior can be overridden by configuring BLIS with the `BLIS_ENABLE_AUTO_PRIME_NUM_THREADS` macro defined in the `bli_family_*.h` file of the relevant subconfiguration. Similarly, the threshold beyond which BLIS will reduce primes by one can be set via `BLIS_NT_MAX_PRIME`. (This latter value is ignored if the former macro is defined.) +* For small numbers of threads, the number requested will be honored faithfully. However, if you request a larger number of threads that happens to also be prime, BLIS will (by default) reduce the number by one in order to allow more efficient thread factorizations. This behavior (in which `BLIS_DISABLE_AUTO_PRIME_NUM_THREADS` is set by default) can be overridden by configuring BLIS with the `BLIS_ENABLE_AUTO_PRIME_NUM_THREADS` macro defined in the `bli_family_*.h` file of the relevant target configuration. This `BLIS_ENABLE_*` macro will allow BLIS to use any prime number of threads. Note that the threshold beyond which BLIS will reduce primes by one (assuming `BLIS_DISABLE_AUTO_PRIME_NUM_THREADS` is set) can be set via `BLIS_NT_MAX_PRIME`. This value is ignored if `BLIS_ENABLE_AUTO_PRIME_NUM_THREADS` is defined. ## Globally via environment variables @@ -126,7 +127,7 @@ Regardless of whether you end up using the automatic or manual way of expressing ### Environment variables: the automatic way -The automatic way of specifying parallelism entails simply setting the total number of threads you wish BLIS to employ in its parallelization.
This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable: +The automatic way of specifying parallelism entails setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable: ``` $ export GOMP_CPU_AFFINITY="..." # optional step when using GNU libgomp. $ export BLIS_NUM_THREADS=16 @@ -134,7 +135,7 @@ $ ./my_blis_program ``` This causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `OMP_NUM_THREADS`. If neither variable is set, the default number of threads is 1. -**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`. +**Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable to specify multithreading within BLIS and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`. ### Environment variables: the manual way @@ -142,15 +143,15 @@ The manual way of specifying parallelism involves communicating which loops with The below chart describes the five loops used in BLIS's matrix multiplication operations. 
-| Loop around microkernel | Environment variable | Direction | Notes | -|:-------------------------|:---------------------|:----------|:---------------| -| 5th loop | `BLIS_JC_NT` | `n` | | -| 4th loop | _N/A_ | `k` | Not enabled | -| 3rd loop | `BLIS_IC_NT` | `m` | | -| 2nd loop | `BLIS_JR_NT` | `n` | Typically <= 4 | -| 1st loop | `BLIS_IR_NT` | `m` | Typically 1 | +| Loop around microkernel | Environment variable | Direction | Notes | +|:-------------------------|:---------------------|:----------|:----------------------| +| 5th loop ("JC loop") | `BLIS_JC_NT` | `n` | | +| 4th loop ("PC loop") | _N/A_ | `k` | Unavailable; always 1 | +| 3rd loop ("IC loop") | `BLIS_IC_NT` | `m` | | +| 2nd loop ("JR loop") | `BLIS_JR_NT` | `n` | Typically <= 8 | +| 1st loop ("IR loop") | `BLIS_IR_NT` | `m` | Typically 1 | -**Note**: Parallelization of the 4th loop is not currently enabled because each iteration of the loop updates the same part of the output matrix C. Thus, to safely parallelize it requires either a reduction or mutex locks when updating C. +**Note**: Parallelization of the 4th loop is not currently available because each iteration of the loop updates the same part of the output matrix C. Thus, to safely parallelize it requires either a reduction or mutex locks when updating C. Parallelization in BLIS is hierarchical. So if we parallelize multiple loops, the total number of threads will be the product of the amount of parallelism for each loop. Thus the total number of threads used is the product of all the values: `BLIS_JC_NT * BLIS_IC_NT * BLIS_JR_NT * BLIS_IR_NT`. @@ -169,6 +170,8 @@ Next, which combinations of loops to parallelize depends on which caches are sha If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. 
(Recall that the environment variables are only read once, when BLIS is initialized). +**Note**: If you set parallelization globally via environment variables and *then* your application *also* uses the global runtime API to set the ways of parallelism, the global runtime API will prevail. + **Note**: Regardless of which way ([automatic](Multithreading.md#globally-at-runtime-the-automatic-way) or [manual](Multithreading.md#globally-at-runtime-the-manual-way)) the global runtime API is used to specify multithreading, that specification will affect operation of BLIS through **both** the BLAS compatibility layer as well as the native ([typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md)) APIs that are unique to BLIS. ### Globally at runtime: the automatic way @@ -181,7 +184,7 @@ This function takes one integer--the total number of threads for BLIS to utilize ```c bli_thread_set_num_threads( 4 ); ``` -we are requesting that the global number of threads be set to 4. You may also query the global number of threads at any time via +we are requesting that the total number of threads (ways of parallelism) be set to 4. You may also query the number of threads at any time via ```c dim_t bli_thread_get_num_threads( void ); ``` @@ -201,7 +204,7 @@ So, for example, if we call ```c bli_thread_set_ways( 2, 1, 4, 1, 1 ); ``` -we are requesting two ways of parallelism in the `JC` loop and 4 ways of parallelism in the `IC` loop. +we are requesting 2 ways of parallelism in the `JC` loop and 4 ways of parallelism in the `IC` loop. Unlike environment variables, which only allow the user to set the parallelization strategy prior to running the executable, `bli_thread_set_ways()` may be called any time during the normal course of the BLIS-linked application's execution. 
## Locally at runtime @@ -210,15 +213,17 @@ In addition to the global methods based on environment variables and runtime fun As with environment variables and the global runtime API, there are two ways to specify parallelism: the automatic way and the manual way. Both ways involve allocating a BLIS-specific object, initializing the object and encoding the desired parallelization, and then passing a pointer to the object into one of the expert interfaces of either the [typed](docs/BLISTypedAPI.md) or [object](docs/BLISObjectAPI) APIs. We provide examples of utilizing this threading object below. +**Note**: If you set parallelization globally via environment variables and/or globally via the runtime API, and *then* specify parallelization locally on a per-call basis, the values specified locally will prevail. + **Note**: Neither way ([automatic](Multithreading.md#locally-at-runtime-the-automatic-way) nor [manual](Multithreading.md#locally-at-runtime-the-manual-way)) of specifying multithreading via the local runtime API can be used via the BLAS interfaces. The local runtime API may *only* be used via the native ([typed](docs/BLISTypedAPI.md) and [object](docs/BLISObjectAPI.md)) APIs, which are unique to BLIS. (Furthermore, the expert interfaces of each API must be used. This is demonstrated later on in this section.) ### Initializing a rntm_t -Before specifying the parallelism (automatically or manually), you must first allocate a special BLIS object called a `rntm_t` (runtime). The object is quite small (about 64 bytes), and so we recommend allocating it statically on the function stack: +Before specifying the parallelism (automatically or manually), you must first allocate a special BLIS object called a `rntm_t` (runtime). The object is quite small (about 128 bytes), and so we recommend allocating it statically on the function stack: ```c rntm_t rntm; ``` -We **strongly recommend** initializing the `rntm_t`. This can be done in either of two ways. 
+You **must** initialize the `rntm_t`. This can be done in either of two ways. If you want to initialize it as part of the declaration, you may do so via the default `BLIS_RNTM_INITIALIZER` macro: ```c rntm_t rntm = BLIS_RNTM_INITIALIZER; @@ -229,7 +234,7 @@ bli_rntm_init( &rntm ); ``` As of this writing, BLIS treats a default-initialized `rntm_t` as a request for single-threaded execution. -**Note**: If you choose to **not** initialize the `rntm_t` object, you **must** set its parallelism via either the automatic way or the manual way, described below. Passing a completely uninitialized `rntm_t` to a level-3 operation **will almost surely result in undefined behavior!** +**Note**: If you choose to **not** initialize the `rntm_t` object and then pass it into a level-3 operation, **you will almost surely observe undefined behavior!** Please don't do this! ### Locally at runtime: the automatic way @@ -241,7 +246,7 @@ As with `bli_thread_set_num_threads()` [discussed previously](Multithreading.md# ```c bli_rntm_set_num_threads( 6, &rntm ); ``` -the `rntm_t` object will be encoded to use a total of 6 threads. +the `rntm_t` object will be encoded to use a total of 6 threads. ### Locally at runtime: the manual way @@ -250,7 +255,7 @@ Once your `rntm_t` is initialized, you may manually encode the ways of paralleli void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ); ``` As with `bli_thread_set_ways()` [discussed previously](Multithreading.md#globally-at-runtime-the-manual-way), this function takes one integer for each loop in the level-3 operations. It also takes the address of the `rntm_t` to modify. -(**Note**: even though the function takes a `pc` argument, it will be ignored until parallelism is supported in the `KC` loop.) +(**Note**: even though the function takes a `pc` argument, it will be ignored--and assumed to be 1--until parallelism is supported in the `KC` loop.) 
So, for example, if we call ```c bli_rntm_set_ways( 1, 1, 2, 3, 1, &rntm ); @@ -259,13 +264,13 @@ we are requesting two ways of parallelism in the `IC` loop and three ways of par ### Locally at runtime: using the expert interfaces -Regardless of whether you specified parallelism into your `rntm_t` object via the automatic or manual method, eventually you must use the data structure when calling a BLIS operation. +Regardless of whether you specified parallelism into your `rntm_t` object via the automatic or manual method, eventually you must use the data structure when calling a BLIS operation in order for it to have any effect. -Let's assume you wish to call `gemm`. To so do, simply use the expert interface, which takes two additional arguments: a `cntx_t` (context) and a `rntm_t`. For the context, you may simply pass in `NULL` and BLIS will select a default context (which is exactly what happens when you call the basic/non-expert interfaces). Here is an example of such a call: +Let's assume you wish to call `gemm`. To do so, use the expert interface, which takes two additional arguments: a `cntx_t` (context) and a `rntm_t`. For the context, you may simply pass in `NULL` and BLIS will select a default context internally (which is exactly what happens for both the `cntx_t*` and `rntm_t*` parameters when you call the basic/non-expert interfaces). Here is an example of such a call: ```c bli_gemm_ex( &alpha, &a, &b, &beta, &c, NULL, &rntm ); ``` -This will cause `gemm` to execute and parallelize in the manner encoded by `rntm`. +This will cause `gemm` to execute and parallelize in the manner encoded by `rntm` (and it will do so using a default `cntx_t*`). To summarize, using a `rntm_t` involves three steps: ```c @@ -293,7 +298,7 @@ Also, you may pass in `NULL` for the `rntm_t*` parameter of an expert interface. There are currently no good *and* easy solutions to this problem.
Eventually, though, we plan to add support for two microkernels per datatype per configuration--one for use with matrices C that are row-stored, and one for those that are column-stored. This will obviate the logic within BLIS that sometimes induces the operation transposition, and the problem will go away. -* **Thread affinity when BLIS and MKL are used together.** Some users have reported that when running a program that links both BLIS (configured with OpenMP) and MKL, **and** when OpenMP thread affinity has been specified (e.g. via `OMP_PROC_BIND` and `OMP_PLACES`), that very poor performance is observed. This may be due to incorrect thread masking in this case, causing all threads to run on one physical core. The exact circumstances leading to this behavior have not been identified, but unsetting the OpenMP thread affinity variables appears to be a solution. +* **Thread affinity when BLIS and MKL are used together.** Some users have reported that when running a program that links both BLIS (configured with OpenMP) and MKL, **and** when OpenMP thread affinity has been specified (e.g. via `OMP_PROC_BIND` and `OMP_PLACES`), that very poor performance is observed. This may be due to incorrect thread masking, causing all threads to run on one physical core. The exact circumstances leading to this behavior have not been identified, but unsetting the OpenMP thread affinity variables appears to be a solution. # Conclusion diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 2c13c74a2..aae0ac043 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -161,70 +161,72 @@ void bli_rntm_set_ways_from_rntm rntm_t* rntm ) { - dim_t nt = bli_rntm_num_threads( rntm ); + // NOTE: While much of the multithreading cpp case of this function may seem + // redundant with bli_thread_init_rntm_from_env(), we need them both. The + // bli_thread_init_rntm_from_env() function is only called to initialize the + // global rntm_t. 
There, the consistency logic serves to make sure that sane + // values will be returned if the application (in the time between library + // initialization and when computation begins) subsequently queries the + // number of threads or ways via the runtime API. This function also needs + // the same consistency logic, but for a different reason: this function + // guarantees that the rntm_t has sane values in the event that the + // application passed in a custom rntm_t via an expert interface. - dim_t jc = bli_rntm_jc_ways( rntm ); - dim_t pc = bli_rntm_pc_ways( rntm ); - dim_t ic = bli_rntm_ic_ways( rntm ); - dim_t jr = bli_rntm_jr_ways( rntm ); - dim_t ir = bli_rntm_ir_ways( rntm ); bool auto_factor = FALSE; + dim_t nt; + dim_t jc, pc, ic, jr, ir; #ifdef BLIS_ENABLE_MULTITHREADING - bool nt_set = FALSE; - bool ways_set = FALSE; - - // If the rntm was fed in as a copy of the global runtime via - // bli_rntm_init_from_global(), we know that either: - // - the num_threads field is -1 and all of the ways are -1; - // - the num_threads field is -1 and all of the ways are set; - // - the num_threads field is set and all of the ways are -1. - // However, we can't be sure that a user-provided rntm_t isn't - // initialized uncleanly. So here we have to enforce some rules - // to get the rntm_t into a predictable state. - - // First, we establish whether or not the number of threads is set. - if ( nt > 0 ) nt_set = TRUE; - - // Take this opportunity to set the auto_factor field. - if ( nt_set ) auto_factor = TRUE; - - // Next, we establish whether or not any of the ways of parallelism - // for each loop were set. If any of the ways are set (positive), we - // then we assume the user wanted to use those positive values and - // default the non-positive values to 1. 
- if ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 ) - { - ways_set = TRUE; - - if ( jc < 1 ) jc = 1; - if ( pc < 1 ) pc = 1; - if ( ic < 1 ) ic = 1; - if ( jr < 1 ) jr = 1; - if ( ir < 1 ) ir = 1; - } + nt = bli_rntm_num_threads( rntm ); + jc = bli_rntm_jc_ways( rntm ); + pc = bli_rntm_pc_ways( rntm ); + ic = bli_rntm_ic_ways( rntm ); + jr = bli_rntm_jr_ways( rntm ); + ir = bli_rntm_ir_ways( rntm ); + + bool nt_set = FALSE; + bool ways_set = FALSE; + + // Some users are mischievous/dumb. Make sure they don't cause trouble. + if ( nt < 1 ) nt = 1; + if ( jc < 1 ) jc = 1; + if ( pc < 1 ) pc = 1; + if ( ic < 1 ) ic = 1; + if ( jr < 1 ) jr = 1; + if ( ir < 1 ) ir = 1; + + // First, we establish whether or not the number of threads or ways of + // parallelism were set to meaningful values. + if ( nt > 1 ) nt_set = TRUE; + if ( jc > 1 ) ways_set = TRUE; + if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values. + if ( ic > 1 ) ways_set = TRUE; + if ( jr > 1 ) ways_set = TRUE; + if ( ir > 1 ) ways_set = TRUE; // Now we use the values of nt_set and ways_set to determine how to // interpret the original values we found in the rntm_t object. if ( ways_set == TRUE ) { - // If the ways were set, then we use the values that were given - // and interpreted above (we set any non-positive value to 1). - // The only thing left to do is calculate the correct number of - // threads. + // If the per-loop ways of parallelism were set, then we use the values + // that were given and interpreted above. The only thing left to do is + // calculate the correct number of threads. Notice that if the user also + // happened to set the total number of threads that value is discarded + // in favor of the implied value from the per-loop ways of parallelism. 
nt = jc * pc * ic * jr * ir; + auto_factor = FALSE; } else if ( ways_set == FALSE && nt_set == TRUE ) { - // If the ways were not set but the number of thread was set, then - // we attempt to automatically generate a thread factorization that + // If the ways were not set but the number of threads was set, then we + // will attempt to automatically generate a thread factorization that // will work given the problem size. -#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS + #ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // If use of prime numbers is disallowed for automatic thread // factorizations, we first check if the number of threads requested // is prime. If it is prime, and it exceeds a minimum threshold, then @@ -232,11 +234,11 @@ void bli_rntm_set_ways_from_rntm // prime. This will allow for automatic thread factorizations to span // two dimensions (loops), which tends to be more efficient. if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1; -#endif + #endif - pc = 1; - - //printf( "m n = %d %d BLIS_THREAD_RATIO_M _N = %d %d\n", (int)m, (int)n, (int)BLIS_THREAD_RATIO_M, (int)BLIS_THREAD_RATIO_N ); + //printf( "m n = %d %d BLIS_THREAD_RATIO_M _N = %d %d\n", + // (int)m, (int)n, (int)BLIS_THREAD_RATIO_M, + // (int)BLIS_THREAD_RATIO_N ); bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M, n*BLIS_THREAD_RATIO_N, &ic, &jc ); @@ -252,27 +254,34 @@ void bli_rntm_set_ways_from_rntm { if ( jc % jr == 0 ) { jc /= jr; break; } } + + // Force the number of ways of parallelism in the pc loop to 1 + // just in case the caller set it to something greater than 1. + pc = 1; + + // Make note that auto-factorization was performed. + auto_factor = TRUE; } else // if ( ways_set == FALSE && nt_set == FALSE ) { - // If neither the ways nor the number of threads were set, then - // the rntm was not meaningfully changed since initialization, - // and thus we'll default to single-threaded execution.
- - nt = 1; - jc = pc = ic = jr = ir = 1; + // If neither the ways nor the number of threads were set, then the + // rntm_t was not meaningfully changed since initialization. This means + // the fields are all 1, which will lead to the default behavior of + // single-threaded execution. + //nt = jc = pc = ic = jr = ir = 1; + //auto_factor = FALSE; } #else - // When multithreading is disabled, always set the rntm_t ways - // values to 1. + // When multithreading is disabled, always set the per-loop ways of + // parallelism to 1. nt = 1; jc = pc = ic = jr = ir = 1; #endif - // Save the results back in the runtime object. + // Save the results back in the rntm_t object. bli_rntm_set_auto_factor_only( auto_factor, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); @@ -286,70 +295,60 @@ void bli_rntm_set_ways_from_rntm_sup rntm_t* rntm ) { - dim_t nt = bli_rntm_num_threads( rntm ); - - dim_t jc = bli_rntm_jc_ways( rntm ); - dim_t pc = bli_rntm_pc_ways( rntm ); - dim_t ic = bli_rntm_ic_ways( rntm ); - dim_t jr = bli_rntm_jr_ways( rntm ); - dim_t ir = bli_rntm_ir_ways( rntm ); - bool auto_factor = FALSE; + dim_t nt; + dim_t jc, pc, ic, jr, ir; #ifdef BLIS_ENABLE_MULTITHREADING - bool nt_set = FALSE; - bool ways_set = FALSE; - - // If the rntm was fed in as a copy of the global runtime via - // bli_rntm_init_from_global(), we know that either: - // - the num_threads field is -1 and all of the ways are -1; - // - the num_threads field is -1 and all of the ways are set; - // - the num_threads field is set and all of the ways are -1. - // However, we can't be sure that a user-provided rntm_t isn't - // initialized uncleanly. So here we have to enforce some rules - // to get the rntm_t into a predictable state. - - // First, we establish whether or not the number of threads is set. - if ( nt > 0 ) nt_set = TRUE; - - // Take this opportunity to set the auto_factor field. 
- if ( nt_set ) auto_factor = TRUE; - - // Next, we establish whether or not any of the ways of parallelism - // for each loop were set. If any of the ways are set (positive), we - // then we assume the user wanted to use those positive values and - // default the non-positive values to 1. - if ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 ) - { - ways_set = TRUE; - - if ( jc < 1 ) jc = 1; - if ( pc < 1 ) pc = 1; - if ( ic < 1 ) ic = 1; - if ( jr < 1 ) jr = 1; - if ( ir < 1 ) ir = 1; - } + nt = bli_rntm_num_threads( rntm ); + jc = bli_rntm_jc_ways( rntm ); + pc = bli_rntm_pc_ways( rntm ); + ic = bli_rntm_ic_ways( rntm ); + jr = bli_rntm_jr_ways( rntm ); + ir = bli_rntm_ir_ways( rntm ); + + bool nt_set = FALSE; + bool ways_set = FALSE; + + // Some users are mischievous/dumb. Make sure they don't cause trouble. + if ( nt < 1 ) nt = 1; + if ( jc < 1 ) jc = 1; + if ( pc < 1 ) pc = 1; + if ( ic < 1 ) ic = 1; + if ( jr < 1 ) jr = 1; + if ( ir < 1 ) ir = 1; + + // First, we establish whether or not the number of threads or ways of + // parallelism were set to meaningful values. + if ( nt > 1 ) nt_set = TRUE; + if ( jc > 1 ) ways_set = TRUE; + if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values. + if ( ic > 1 ) ways_set = TRUE; + if ( jr > 1 ) ways_set = TRUE; + if ( ir > 1 ) ways_set = TRUE; // Now we use the values of nt_set and ways_set to determine how to // interpret the original values we found in the rntm_t object. if ( ways_set == TRUE ) { - // If the ways were set, then we use the values that were given - // and interpreted above (we set any non-positive value to 1). - // The only thing left to do is calculate the correct number of - // threads. + // If the per-loop ways of parallelism were set, then we use the values + // that were given and interpreted above. The only thing left to do is + // calculate the correct number of threads. 
Notice that if the user also + // happened to set the total number of threads, that value is discarded + // in favor of the implied value from the per-loop ways of parallelism. nt = jc * pc * ic * jr * ir; + auto_factor = FALSE; } else if ( ways_set == FALSE && nt_set == TRUE ) { - // If the ways were not set but the number of thread was set, then - // we attempt to automatically generate a thread factorization that - // will work given the problem size. + // If the ways were not set but the number of threads was set, then we + // will attempt to automatically generate a thread factorization that + // will work given the problem size. -#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS + #ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // If use of prime numbers is disallowed for automatic thread // factorizations, we first check if the number of threads requested // is prime. If it is prime, and it exceeds a minimum threshold, then @@ -357,17 +356,17 @@ void bli_rntm_set_ways_from_rntm_sup // prime. This will allow for automatic thread factorizations to span // two dimensions (loops), which tends to be more efficient. if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1; -#endif - - pc = 1; + #endif //bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M, // n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc ); bli_thread_partition_2x2( nt, m, n, &ic, &jc ); -//printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d ic = %d\n", (int)jc, (int)ic ); -#if 0 + //printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d ic = %d\n", + // (int)jc, (int)ic ); + + #if 0 for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- ) { if ( ic % ir == 0 ) { ic /= ir; break; } @@ -377,32 +376,38 @@ void bli_rntm_set_ways_from_rntm_sup { if ( jc % jr == 0 ) { jc /= jr; break; } } -#else + #else ir = 1; jr = 1; + #endif -#endif + // Force the number of ways of parallelism in the pc loop to 1 just in + // case the caller set it to something greater than 1. + pc = 1; + + // Make note that auto-factorization was performed.
+ auto_factor = TRUE; } else // if ( ways_set == FALSE && nt_set == FALSE ) { - // If neither the ways nor the number of threads were set, then - // the rntm was not meaningfully changed since initialization, - // and thus we'll default to single-threaded execution. - - nt = 1; - jc = pc = ic = jr = ir = 1; + // If neither the ways nor the number of threads were set, then the + // rntm_t was not meaningfully changed since initialization. This means + // the fields are all 1, which will lead to the default behavior of + // single-threaded execution. + //nt = jc = pc = ic = jr = ir = 1; + //auto_factor = FALSE; } #else - // When multithreading is disabled, always set the rntm_t ways - // values to 1. + // When multithreading is disabled, always set the per-loop ways of + // parallelism to 1. nt = 1; jc = pc = ic = jr = ir = 1; #endif - // Save the results back in the runtime object. + // Save the results back in the rntm_t object. bli_rntm_set_auto_factor_only( auto_factor, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index 2a39f8894..8b6538484 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -46,7 +46,7 @@ typedef struct rntm_s bool auto_factor; dim_t num_threads; - dim_t* thrloop; + dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; bool pack_b; bool l3_sup; @@ -129,22 +129,6 @@ BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm ) return rntm->pba; } -#if 0 -BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) -{ - const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); - const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); - const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); - const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); - const bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); - const bool ir = bli_rntm_ir_ways( rntm1 
) == bli_rntm_ir_ways( rntm2 ); - const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); - - if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; - else return FALSE; -} -#endif - // // -- rntm_t modification (internal use only) ---------------------------------- // @@ -170,7 +154,7 @@ BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { - bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); + bli_rntm_set_ways_for_only( BLIS_KC, 1, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { @@ -193,7 +177,7 @@ BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); - bli_rntm_set_pc_ways_only( pc, rntm ); + bli_rntm_set_pc_ways_only( 1, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); @@ -212,11 +196,11 @@ BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { - bli_rntm_set_num_threads_only( -1, rntm ); + bli_rntm_set_num_threads_only( 1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { - bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { @@ -244,14 +228,16 @@ BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_ { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); - bli_rntm_set_pc_ways_only( pc, rntm ); + bli_rntm_set_pc_ways_only( 1, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); - // Set the num_threads field to a default state. 
- bli_rntm_clear_num_threads_only( rntm ); + // Set the num_threads field to the product of all the ways. The only + // benefit of doing this, though, is that the user can query the total + // number of threads from the rntm_t after calling this function. + bli_rntm_set_num_threads_only( jc * 1 * ic * jr * ir, rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) @@ -307,8 +293,8 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) #define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ - .num_threads = -1, \ - .thrloop = { -1, -1, -1, -1, -1, -1 }, \ + .num_threads = 1, \ + .thrloop = { 1, 1, 1, 1, 1, 1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 7d647a314..0e5afa3f8 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -1564,7 +1564,7 @@ void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ) // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); - bli_rntm_set_ways_only( jc, pc, ic, jr, ir, &global_rntm ); + bli_rntm_set_ways_only( jc, 1, ic, jr, ir, &global_rntm ); // Release the mutex protecting global_rntm. bli_pthread_mutex_unlock( &global_rntm_mutex ); @@ -1595,6 +1595,17 @@ void bli_thread_init_rntm_from_env // function is only called from bli_thread_init(), which is only called // by bli_init_once(). + // NOTE: While much of the multithreading cpp case of this function may seem + // redundant with bli_rntm_set_ways_from_rntm(), we need them both. This + // function is only called to initialize the global rntm_t. Here, the + // consistency logic serves to make sure that sane values will be returned + // if the application (in the time between library initialization and when + // computation begins) subsequently queries the number of threads or ways + // via the runtime API. 
The bli_rntm_set_ways_from_rntm() function also + // needs the same consistency logic, but for a different reason: that + // function guarantees that the rntm_t has sane values in the event that the + // application passed in a custom rntm_t via an expert interface. + bool auto_factor = FALSE; dim_t nt; dim_t jc, pc, ic, jr, ir; @@ -1608,44 +1619,72 @@ void bli_thread_init_rntm_from_env if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); - // Read the environment variables for the number of threads (ways - // of parallelism) for each individual loop. + // Read the environment variables for the number of threads (ways of + // parallelism) for each individual loop. jc = bli_env_get_var( "BLIS_JC_NT", -1 ); - pc = bli_env_get_var( "BLIS_PC_NT", -1 ); + pc = bli_env_get_var( "BLIS_PC_NT", -1 ); pc = 1; // Disable PC_NT values. ic = bli_env_get_var( "BLIS_IC_NT", -1 ); jr = bli_env_get_var( "BLIS_JR_NT", -1 ); ir = bli_env_get_var( "BLIS_IR_NT", -1 ); - // If any BLIS_*_NT environment variable was set, then we ignore the - // value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the - // BLIS_*_NT values instead (with unset variables being treated as if - // they contained 1). - if ( jc != -1 || pc != -1 || ic != -1 || jr != -1 || ir != -1 ) + bool nt_set = FALSE; + bool ways_set = FALSE; + + // Some users are mischievous/dumb. Make sure they don't cause trouble. + if ( nt < 1 ) nt = 1; + if ( jc < 1 ) jc = 1; + if ( pc < 1 ) pc = 1; + if ( ic < 1 ) ic = 1; + if ( jr < 1 ) jr = 1; + if ( ir < 1 ) ir = 1; + + // First, we establish whether or not the number of threads or ways of + // parallelism were set to meaningful values. + if ( nt > 1 ) nt_set = TRUE; + if ( jc > 1 ) ways_set = TRUE; + if ( pc > 1 ) ways_set = TRUE; + if ( ic > 1 ) ways_set = TRUE; + if ( jr > 1 ) ways_set = TRUE; + if ( ir > 1 ) ways_set = TRUE; + + // Now we use the values of nt_set and ways_set to determine how to + // interpret the original values we found in the rntm_t object. 
+ + if ( ways_set == TRUE ) { - if ( jc == -1 ) jc = 1; - if ( pc == -1 ) pc = 1; - if ( ic == -1 ) ic = 1; - if ( jr == -1 ) jr = 1; - if ( ir == -1 ) ir = 1; - - // Unset the value for nt. - nt = -1; + // If the per-loop ways of parallelism were set, then we use the values + // that were given and interpreted above. The only thing left to do is + // calculate the correct number of threads. Notice that if the user also + // happened to set BLIS_NUM_THREADS, that value is discarded in favor of + // the implied value from the per-loop ways of parallelism. + + nt = jc * pc * ic * jr * ir; + auto_factor = FALSE; + } + else if ( ways_set == FALSE && nt_set == TRUE ) + { + // If the ways were not set but the number of thread was set, then we + // will attempt to automatically generate a thread factorization that + // will work given the problem size. This auto-factorization will + // occur later, in bli_rntm_set_ways_from_rntm(), once we know the + // problem size. + + // Make note that auto-factorization will be performed. + auto_factor = TRUE; + } + else // if ( ways_set == FALSE && nt_set == FALSE ) + { + // If neither the ways nor the number of threads were set, then we + // allow the default values to stand. + //nt = jc = pc = ic = jr = ir = 1; + //auto_factor = FALSE; } - - // By this time, one of the following conditions holds: - // - nt is -1 and the ways for each loop are -1. - // - nt is -1 and the ways for each loop are all set. - // - nt is set and the ways for each loop are -1. - - // If nt is set (ie: not -1), then we know we will perform an automatic - // thread factorization (later, in bli_rntm.c). - if ( nt != -1 ) auto_factor = TRUE; #else - // When multithreading is disabled, always set the rntm_t ways - // values to 1. - nt = -1; + // When multithreading is disabled, always set the per-loop ways of + // parallelism to 1. 
+ nt = 1; jc = pc = ic = jr = ir = 1; #endif diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 3ce92e377..3bfde8788 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -795,8 +795,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) char ir_nt_str[16]; // Query the number of ways of parallelism per loop (and overall) and - // convert these values into strings, with "unset" being used if the - // value returned was -1 (indicating the environment variable was unset). + // convert these values into strings. dim_t nt = bli_thread_get_num_threads(); dim_t jc_nt = bli_thread_get_jc_nt(); dim_t pc_nt = bli_thread_get_pc_nt(); @@ -804,18 +803,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) dim_t jr_nt = bli_thread_get_jr_nt(); dim_t ir_nt = bli_thread_get_ir_nt(); - if ( nt == -1 ) sprintf( nt_str, "unset" ); - else sprintf( nt_str, "%d", ( int ) nt ); - if ( jc_nt == -1 ) sprintf( jc_nt_str, "unset" ); - else sprintf( jc_nt_str, "%d", ( int )jc_nt ); - if ( pc_nt == -1 ) sprintf( pc_nt_str, "unset" ); - else sprintf( pc_nt_str, "%d", ( int )pc_nt ); - if ( ic_nt == -1 ) sprintf( ic_nt_str, "unset" ); - else sprintf( ic_nt_str, "%d", ( int )ic_nt ); - if ( jr_nt == -1 ) sprintf( jr_nt_str, "unset" ); - else sprintf( jr_nt_str, "%d", ( int )jr_nt ); - if ( ir_nt == -1 ) sprintf( ir_nt_str, "unset" ); - else sprintf( ir_nt_str, "%d", ( int )ir_nt ); + sprintf( nt_str, "%d", ( int ) nt ); + sprintf( jc_nt_str, "%d", ( int )jc_nt ); + sprintf( pc_nt_str, "%d", ( int )pc_nt ); + sprintf( ic_nt_str, "%d", ( int )ic_nt ); + sprintf( jr_nt_str, "%d", ( int )jr_nt ); + sprintf( ir_nt_str, "%d", ( int )ir_nt ); // Set up rntm_t objects for each of the four families: // gemm, herk, trmm, trsm. From 63177dca48cb7d066576d884da4a7a599ececebf Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Thu, 15 Sep 2022 11:21:26 -0500 Subject: [PATCH 082/230] Fixed gemmlike sandbox bug introduced in 7c07b47. Details: - Fixed a bug in the 'gemmlike' sandbox that was introduced in 7c07b47. This bug was the result of the fact that the gemmlike implementation uses bli_thrinfo_sup_grow() to grow its thrinfo_t tree, but the aforementioned commit added an optimization that kicks in when the rntm_t .pack_a and .pack_b fields are both FALSE. Those fields were originally added only for sup execution; for the large code path, they are intended to be ignored. But the default initial state of a rntm_t has those fields set to FALSE, which was inadvertently activating the optimization (which targeted single-threaded cases only) and would cause multithreaded use cases of 'gemmlike' to segfault. The fix took the form of setting the .pack_a and .pack_b fields to TRUE in bls_gemm_ex(). - Added minimal 'const' and 'const'-casting to 'gemmlike' so that gcc stays quiet. --- sandbox/gemmlike/bli_gemm_ex.c | 23 ++++++++++++++--------- sandbox/gemmlike/bls_gemm.c | 29 +++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/sandbox/gemmlike/bli_gemm_ex.c b/sandbox/gemmlike/bli_gemm_ex.c index 96dae1a3a..fe220e603 100644 --- a/sandbox/gemmlike/bli_gemm_ex.c +++ b/sandbox/gemmlike/bli_gemm_ex.c @@ -46,13 +46,13 @@ void bli_gemm_ex ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -65,7 +65,11 @@ void bli_gemm_ex // directly. if ( 1 ) { - bls_gemm_ex( alpha, a, b, beta, c, cntx, rntm ); + bls_gemm_ex + ( + ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c, + ( cntx_t* )cntx, ( rntm_t* )rntm + ); return; } @@ -85,7 +89,8 @@ void bli_gemm_ex // Invoke the operation's front end. 
bli_gemm_front ( - alpha, a, b, beta, c, cntx, rntm, NULL + ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c, + ( cntx_t* )cntx, ( rntm_t* )rntm, NULL ); } diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index ec5d8d5b1..d960928a4 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -78,14 +78,27 @@ void bls_gemm_ex if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } + // Set the .pack_a and .pack_b fields to TRUE. This is only needed because + // this sandbox uses bli_thrinfo_sup_grow(), which calls + // bli_thrinfo_sup_create_for_cntl(), which employs an optimization if + // both fields are FALSE (as is often the case with sup). However, this + // sandbox implements the "large" code path, and so both A and B must + // always be packed. Setting the fields to TRUE will avoid the optimization + // while this sandbox implementation executes (and it also reinforces the + // fact that we *are* indeed packing A and B, albeit not in the sup context + // originally envisioned for the .pack_a and .pack_b fields). + bli_rntm_set_pack_a( TRUE, rntm ); + bli_rntm_set_pack_b( TRUE, rntm ); + // Obtain a valid (native) context from the gks if necessary. // NOTE: This must be done before calling the _check() function, since // that function assumes the context pointer is valid. - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); // Check parameters. 
if ( bli_error_checking_is_enabled() ) - bls_gemm_check( alpha, a, b, beta, c, cntx ); + bls_gemm_check( ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, + ( obj_t* )beta, ( obj_t* )c, ( cntx_t* )cntx ); // -- bli_gemm_front() ----------------------------------------------------- @@ -163,12 +176,12 @@ void bls_gemm_ex ( bls_gemm_int, BLIS_GEMM, // operation family id - alpha, - &a_local, - &b_local, - beta, - &c_local, - cntx, + ( obj_t* )alpha, + ( obj_t* )&a_local, + ( obj_t* )&b_local, + ( obj_t* )beta, + ( obj_t* )&c_local, + ( cntx_t* )cntx, rntm ); } From e86076bf4461d1a78186fb21ba8320cfb430f62c Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 15 Sep 2022 14:22:59 -0500 Subject: [PATCH 083/230] Test the 'gemmlike' sandbox via AppVeyor. (#664) Details: - Added a fifth test to our .appveyor.yml that enables the 'gemmlike' sandbox with OpenMP enabled (via clang, the 'auto' configuration target, and building to a static library). Thanks to Jeff Diamond for pointing out that this test would be useful. 
--- .appveyor.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index f4f56fa15..cafad4817 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -23,6 +23,12 @@ environment: CC: clang THREADING: openmp + - LIB_TYPE: static + CONFIG: auto + CC: clang + THREADING: openmp + SANDBOX: yes + install: - set "PATH=C:\msys64\mingw64\bin;C:\msys64\bin;%PATH%" - if [%CC%]==[clang] set "PATH=C:\Program Files\LLVM\bin;%PATH%" @@ -34,6 +40,7 @@ build_script: - if [%LIB_TYPE%]==[shared] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --enable-shared --disable-static" - if [%LIB_TYPE%]==[static] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --disable-shared --enable-static" - if not [%CBLAS%]==[no] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% --enable-cblas" +- if [%SANDBOX%]==[yes] set "CONFIGURE_OPTS=%CONFIGURE_OPTS% -s gemmlike" - set RANLIB=echo - set LIBPTHREAD= - set "PATH=%PATH%;C:\blis\lib" From fb91337eff1ee2098f315a83888f6667b3a56f86 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 15 Sep 2022 19:08:10 -0500 Subject: [PATCH 084/230] Fixed a harmless pc_nt bug in 05a811e. Details: - Added missing curly braces around some statements in bli_rntm.c, one of which needed them in order for the relevant code to be executed in the intended way. The consequence of 05a811e omitting those braces was that a statement (pc_nt = 1;) was executed more often than it needed to be. - Also adjusted the analogous code in bli_thread.c to match that of bli_rntm.c. --- frame/base/bli_rntm.c | 24 ++++++++++++------------ frame/thread/bli_thread.c | 14 +++++++------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index aae0ac043..1411ffaa3 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -199,12 +199,12 @@ void bli_rntm_set_ways_from_rntm // First, we establish whether or not the number of threads or ways of // parallelism were set to meaningful values. 
- if ( nt > 1 ) nt_set = TRUE; - if ( jc > 1 ) ways_set = TRUE; - if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values. - if ( ic > 1 ) ways_set = TRUE; - if ( jr > 1 ) ways_set = TRUE; - if ( ir > 1 ) ways_set = TRUE; + if ( nt > 1 ) { nt_set = TRUE; } + if ( jc > 1 ) { ways_set = TRUE; } + if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values. + if ( ic > 1 ) { ways_set = TRUE; } + if ( jr > 1 ) { ways_set = TRUE; } + if ( ir > 1 ) { ways_set = TRUE; } // Now we use the values of nt_set and ways_set to determine how to // interpret the original values we found in the rntm_t object. @@ -321,12 +321,12 @@ void bli_rntm_set_ways_from_rntm_sup // First, we establish whether or not the number of threads or ways of // parallelism were set to meaningful values. - if ( nt > 1 ) nt_set = TRUE; - if ( jc > 1 ) ways_set = TRUE; - if ( pc > 1 ) ways_set = TRUE; pc = 1; // Disable pc_nt values. - if ( ic > 1 ) ways_set = TRUE; - if ( jr > 1 ) ways_set = TRUE; - if ( ir > 1 ) ways_set = TRUE; + if ( nt > 1 ) { nt_set = TRUE; } + if ( jc > 1 ) { ways_set = TRUE; } + if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values. + if ( ic > 1 ) { ways_set = TRUE; } + if ( jr > 1 ) { ways_set = TRUE; } + if ( ir > 1 ) { ways_set = TRUE; } // Now we use the values of nt_set and ways_set to determine how to // interpret the original values we found in the rntm_t object. diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 0e5afa3f8..9bad6a456 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -1622,7 +1622,7 @@ void bli_thread_init_rntm_from_env // Read the environment variables for the number of threads (ways of // parallelism) for each individual loop. jc = bli_env_get_var( "BLIS_JC_NT", -1 ); - pc = bli_env_get_var( "BLIS_PC_NT", -1 ); pc = 1; // Disable PC_NT values. 
+ pc = bli_env_get_var( "BLIS_PC_NT", -1 ); ic = bli_env_get_var( "BLIS_IC_NT", -1 ); jr = bli_env_get_var( "BLIS_JR_NT", -1 ); ir = bli_env_get_var( "BLIS_IR_NT", -1 ); @@ -1640,12 +1640,12 @@ void bli_thread_init_rntm_from_env // First, we establish whether or not the number of threads or ways of // parallelism were set to meaningful values. - if ( nt > 1 ) nt_set = TRUE; - if ( jc > 1 ) ways_set = TRUE; - if ( pc > 1 ) ways_set = TRUE; - if ( ic > 1 ) ways_set = TRUE; - if ( jr > 1 ) ways_set = TRUE; - if ( ir > 1 ) ways_set = TRUE; + if ( nt > 1 ) { nt_set = TRUE; } + if ( jc > 1 ) { ways_set = TRUE; } + if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values. + if ( ic > 1 ) { ways_set = TRUE; } + if ( jr > 1 ) { ways_set = TRUE; } + if ( ir > 1 ) { ways_set = TRUE; } // Now we use the values of nt_set and ways_set to determine how to // interpret the original values we found in the rntm_t object. From 89df7b8fa3a3e47ab2fc10ac4d65d0b9fde16942 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 18 Sep 2022 18:46:57 -0500 Subject: [PATCH 085/230] De-templatized _sup_var1n2m.c; unified _sup_packm_a/b(). (#659) Details: - Re-expressed the two variants in frame/3/bli_l3_sup_var1n2m.c as a single function each that performs char* pointer arithmetic rather than four datatype-specific functions. Did the same for the functions in bli_l3_sup_packm_a.c and _sup_packm_b.c, and then unified the two into a single set of functions for packing either A or B, which now resides in bli_l3_sup_packm.c. - Pre-grow the cntl_t tree in both bli_l3_sup_var1n2m.c variants rather than grow them incrementally. - Relocated empty-matrix and scale-by-beta early return handling from bli_gemm_front() and bli_gemmt_front() to their _ex() counterparts. - Comment, whitespace updates. 
--- frame/3/bli_l3.h | 3 +- frame/3/bli_l3_oapi_ex.c | 26 + frame/3/bli_l3_sup_packm.c | 428 +++++++ frame/3/bli_l3_sup_packm.h | 95 ++ frame/3/bli_l3_sup_packm_a.c | 430 ------- frame/3/bli_l3_sup_packm_a.h | 118 -- frame/3/bli_l3_sup_packm_b.c | 430 ------- frame/3/bli_l3_sup_packm_b.h | 118 -- frame/3/bli_l3_sup_packm_var.c | 38 +- frame/3/bli_l3_sup_packm_var.h | 38 +- frame/3/bli_l3_sup_var1n2m.c | 1895 ++++++++++++------------------- frame/3/bli_l3_sup_vars.h | 26 - frame/3/gemm/bli_gemm_front.c | 16 - frame/3/gemmt/bli_gemmt_front.c | 16 - 14 files changed, 1340 insertions(+), 2337 deletions(-) create mode 100644 frame/3/bli_l3_sup_packm.c create mode 100644 frame/3/bli_l3_sup_packm.h delete mode 100644 frame/3/bli_l3_sup_packm_a.c delete mode 100644 frame/3/bli_l3_sup_packm_a.h delete mode 100644 frame/3/bli_l3_sup_packm_b.c delete mode 100644 frame/3/bli_l3_sup_packm_b.h diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 4dc1a9d54..9d39fc47d 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -71,8 +71,7 @@ #include "bli_l3_sup_ref.h" #include "bli_l3_sup_int.h" #include "bli_l3_sup_vars.h" -#include "bli_l3_sup_packm_a.h" -#include "bli_l3_sup_packm_b.h" +#include "bli_l3_sup_packm.h" #include "bli_l3_sup_packm_var.h" // Prototype microkernel wrapper APIs. diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index 20b0294eb..16e5f15de 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -55,6 +55,19 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) { bli_init_once(); + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) return; + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + // If the rntm is non-NULL, it may indicate that we should forgo sup // handling altogether. 
bool enable_sup = TRUE; @@ -128,6 +141,19 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) { bli_init_once(); + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) return; + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c new file mode 100644 index 000000000..b7a7ee02b --- /dev/null +++ b/frame/3/bli_l3_sup_packm.c @@ -0,0 +1,428 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_packm_sup_init_mem + ( + bool will_pack, + packbuf_t pack_buf_type, + num_t dt, + dim_t m, + dim_t k, + dim_t mr, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ) +{ + // Inspect whether we are going to be packing matrix A. + if ( will_pack == FALSE ) + { + } + else // if ( will_pack == TRUE ) + { + // NOTE: This "rounding up" of the last upanel is actually optional + // for the rrc/crc cases, but absolutely necessary for the other cases + // since we NEED that last micropanel to have the same ldim (cs_p) as + // the other micropanels. Why? So that millikernels can use the same + // upanel ldim for all iterations of the ir loop. + const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; + const dim_t k_pack = k; + + // Barrier to make sure all threads are caught up and ready to begin + // the packm stage. + bli_thread_barrier( thread ); + + // Compute the size of the memory block eneded. + siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack; + + // Check the mem_t entry provided by the caller. If it is unallocated, + // then we need to acquire a block from the pba. + if ( bli_mem_is_unalloc( mem ) ) + { + if ( bli_thread_am_ochief( thread ) ) + { + // Acquire directly to the chief thread's mem_t that was + // passed in. 
It needs to be that mem_t struct, and not a + // local (temporary) mem_t, since there is no barrier until + // after packing is finished, which could allow a race + // condition whereby the chief thread exits the current + // function before the other threads have a chance to copy + // from it. (A barrier would fix that race condition, but + // then again, I prefer to keep barriers to a minimum.) + bli_pba_acquire_m + ( + rntm, + size_needed, + pack_buf_type, + mem + ); + } + + // Broadcast the address of the chief thread's passed-in mem_t + // to all threads. + mem_t* mem_p = bli_thread_broadcast( thread, mem ); + + // Non-chief threads: Copy the contents of the chief thread's + // passed-in mem_t to the passed-in mem_t for this thread. (The + // chief thread already has the mem_t, so it does not need to + // perform any copy.) + if ( !bli_thread_am_ochief( thread ) ) + { + *mem = *mem_p; + } + } + else // if ( bli_mem_is_alloc( mem ) ) + { + // If the mem_t entry provided by the caller does NOT contain a NULL + // buffer, then a block has already been acquired from the pba and + // cached by the caller. + + // As a sanity check, we should make sure that the mem_t object isn't + // associated with a block that is too small compared to the size of + // the packed matrix buffer that is needed, according to the value + // computed above. + siz_t mem_size = bli_mem_size( mem ); + + if ( mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread releases the existing block associated + // with the mem_t, and then re-acquires a new block, saving + // the associated mem_t to its passed-in mem_t. (See coment + // above for why the acquisition needs to be directly to + // the chief thread's passed-in mem_t and not a local + // (temporary) mem_t. 
+ bli_pba_release + ( + rntm, + mem + ); + bli_pba_acquire_m + ( + rntm, + size_needed, + pack_buf_type, + mem + ); + } + + // Broadcast the address of the chief thread's passed-in mem_t + // to all threads. + mem_t* mem_p = bli_thread_broadcast( thread, mem ); + + // Non-chief threads: Copy the contents of the chief thread's + // passed-in mem_t to the passed-in mem_t for this thread. (The + // chief thread already has the mem_t, so it does not need to + // perform any copy.) + if ( !bli_thread_am_ochief( thread ) ) + { + *mem = *mem_p; + } + } + else + { + // If the mem_t entry is already allocated and sufficiently large, + // then we use it as-is. No action is needed. + } + } + } +} + +void bli_packm_sup_finalize_mem + ( + bool did_pack, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ) +{ + // Inspect whether we previously packed matrix A. + if ( did_pack == FALSE ) + { + // If we didn't pack matrix A, there's nothing to be done. + } + else // if ( did_pack == TRUE ) + { + if ( thread != NULL ) + if ( bli_thread_am_ochief( thread ) ) + { + // Check the mem_t entry provided by the caller. Only proceed if it + // is allocated, which it should be. + if ( bli_mem_is_alloc( mem ) ) + { + bli_pba_release + ( + rntm, + mem + ); + } + } + } +} + +void bli_packm_sup_init + ( + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + mem_t* mem + ) +{ + // Inspect whether we are going to be packing matrix A. + if ( will_pack == FALSE ) + { + *m_max = m; + *k_max = k; + + // Set the parameters for use with no packing of A (ie: using the + // source matrix A directly). + { + // Use the strides of the source matrix as the final values. + *rs_p = rs_x; + *cs_p = cs_x; + + *pd_p = mr; + *ps_p = mr * rs_x; + + // Set the schema to "not packed" to indicate that packing will be + // skipped. 
+ *schema = BLIS_NOT_PACKED; + } + + // Since we won't be packing, simply update the buffer address provided + // by the caller to point to source matrix. + *p = ( void* )x; + } + else // if ( will_pack == TRUE ) + { + // NOTE: This is "rounding up" of the last upanel is actually optional + // for the rrc/crc cases, but absolutely necessary for the other cases + // since we NEED that last micropanel to have the same ldim (cs_p) as + // the other micropanels. Why? So that millikernels can use the same + // upanel ldim for all iterations of the ir loop. + *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; + *k_max = k; + + // Determine the dimensions and strides for the packed matrix A. + if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) + { + // stor3_t id values _RRC and _CRC: pack A to plain row storage. + *rs_p = k; + *cs_p = 1; + + *pd_p = mr; + *ps_p = mr * k; + + // Set the schema to "row packed" to indicate packing to plain + // row storage. + *schema = BLIS_PACKED_ROWS; + } + else + { + // All other stor3_t ids: pack A to column-stored row-panels. + *rs_p = 1; + *cs_p = mr; + + *pd_p = mr; + *ps_p = mr * k; + + // Set the schema to "packed row panels" to indicate packing to + // conventional column-stored row panels. + *schema = BLIS_PACKED_ROW_PANELS; + } + + // Set the buffer address provided by the caller to point to the + // memory associated with the mem_t entry acquired from the pba. 
+ *p = bli_mem_buffer( mem ); + } +} + +typedef void (*packm_sup_var1_fp) + ( + trans_t transc, + pack_t schema, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p, + cntx_t* cntx, + thrinfo_t* thread + ); + +typedef void (*packm_sup_var2_fp) + ( + trans_t transc, + pack_t schema, + dim_t m, + dim_t n, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + cntx_t* cntx, + thrinfo_t* thread + ); + +static packm_sup_var1_fp GENARRAY(packm_sup_var1,packm_sup_var1); +static packm_sup_var2_fp GENARRAY(packm_sup_var2,packm_sup_var2); + +// +// Define BLAS-like interfaces to the variant chooser. +// + +void bli_packm_sup + ( + bool will_pack, + packbuf_t pack_buf_type, + stor3_t stor_id, + trans_t transc, + num_t dt, + dim_t m_alloc, + dim_t k_alloc, + dim_t m, + dim_t k, + dim_t mr, + const void* kappa, + const void* a, inc_t rs_a, inc_t cs_a, + void** p, inc_t* rs_p, inc_t* cs_p, + inc_t* ps_p, + const cntx_t* cntx, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ) +{ + pack_t schema; + dim_t m_max; + dim_t k_max; + dim_t pd_p; + + // Prepare the packing destination buffer. If packing is not requested, + // this function will reduce to a no-op. + bli_packm_sup_init_mem + ( + will_pack, + pack_buf_type, + dt, m_alloc, k_alloc, mr, + rntm, + mem, + thread + ); + + // Determine the packing buffer and related parameters for matrix A. If A + // will not be packed, then a_use will be set to point to a and the _a_use + // strides will be set accordingly. + bli_packm_sup_init + ( + will_pack, + stor_id, + &schema, + m, k, mr, + &m_max, &k_max, + a, rs_a, cs_a, + p, rs_p, cs_p, + &pd_p, ps_p, + mem + ); + + // Inspect whether we are going to be packing matrix A. + if ( will_pack == FALSE ) + { + // If we aren't going to pack matrix A, then there's nothing to do. 
+ + // printf( "blis_ packm_sup_a: not packing A.\n" ); + } + else // if ( will_pack == TRUE ) + { + if ( schema == BLIS_PACKED_ROWS ) + { + // printf( "blis_ packm_sup_a: packing A to rows.\n" ); + + // For plain packing by rows, use var2. + packm_sup_var2[ dt ] + ( + transc, + schema, + m, + k, + ( void* )kappa, + ( void* )a, rs_a, cs_a, + *p, *rs_p, *cs_p, + ( cntx_t* )cntx, + thread + ); + } + else // if ( schema == BLIS_PACKED_ROW_PANELS ) + { + // printf( "blis_ packm_sup_a: packing A to row panels.\n" ); + + // For packing to column-stored row panels, use var1. + packm_sup_var1[ dt ] + ( + transc, + schema, + m, + k, + m_max, + k_max, + ( void* )kappa, + ( void* )a, rs_a, cs_a, + *p, *rs_p, *cs_p, + pd_p, *ps_p, + ( cntx_t* )cntx, + thread + ); + } + + // Barrier so that packing is done before computation. + bli_thread_barrier( thread ); + } +} + diff --git a/frame/3/bli_l3_sup_packm.h b/frame/3/bli_l3_sup_packm.h new file mode 100644 index 000000000..a84d4e45c --- /dev/null +++ b/frame/3/bli_l3_sup_packm.h @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +void bli_packm_sup_init_mem + ( + bool will_pack, + packbuf_t pack_buf_type, + num_t dt, + dim_t m, + dim_t k, + dim_t mr, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ); + +void bli_packm_sup_finalize_mem + ( + bool did_pack, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ); + +void bli_packm_sup_init + ( + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + mem_t* mem + ); + +void bli_packm_sup + ( + bool will_pack, + packbuf_t pack_buf_type, + stor3_t stor_id, + trans_t transc, + num_t dt, + dim_t m_alloc, + dim_t k_alloc, + dim_t m, + dim_t k, + dim_t mr, + const void* kappa, + const void* a, inc_t rs_a, inc_t cs_a, + void** p, inc_t* rs_p, inc_t* cs_p, + inc_t* ps_p, + const cntx_t* cntx, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ); + diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c deleted file mode 100644 index 6b73050fd..000000000 --- a/frame/3/bli_l3_sup_packm_a.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing 
high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix A. 
*/ \ - if ( will_pack == FALSE ) \ - { \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ - const dim_t k_pack = k; \ -\ - /* Barrier to make sure all threads are caught up and ready to begin - the packm stage. */ \ - bli_thread_barrier( thread ); \ -\ - /* Compute the size of the memory block eneded. */ \ - siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ -\ - /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ - if ( bli_mem_is_unalloc( mem ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Acquire directly to the chief thread's mem_t that was - passed in. It needs to be that mem_t struct, and not a - local (temporary) mem_t, since there is no barrier until - after packing is finished, which could allow a race - condition whereby the chief thread exits the current - function before the other threads have a chance to copy - from it. (A barrier would fix that race condition, but - then again, I prefer to keep barriers to a minimum.) */ \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) 
*/ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else /* if ( bli_mem_is_alloc( mem ) ) */ \ - { \ - /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ -\ - /* As a sanity check, we should make sure that the mem_t object isn't - associated with a block that is too small compared to the size of - the packed matrix buffer that is needed, according to the value - computed above. */ \ - siz_t mem_size = bli_mem_size( mem ); \ -\ - if ( mem_size < size_needed ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* The chief thread releases the existing block associated - with the mem_t, and then re-acquires a new block, saving - the associated mem_t to its passed-in mem_t. (See coment - above for why the acquisition needs to be directly to - the chief thread's passed-in mem_t and not a local - (temporary) mem_t. */ \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else \ - { \ - /* If the mem_t entry is already allocated and sufficiently large, - then we use it as-is. No action is needed. 
*/ \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we previously packed matrix A. */ \ - if ( did_pack == FALSE ) \ - { \ - /* If we didn't pack matrix A, there's nothing to be done. */ \ - } \ - else /* if ( did_pack == TRUE ) */ \ - { \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. */ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* m_max, \ - dim_t* k_max, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix A. */ \ - if ( will_pack == FALSE ) \ - { \ - *m_max = m; \ - *k_max = k; \ -\ - /* Set the parameters for use with no packing of A (ie: using the - source matrix A directly). */ \ - { \ - /* Use the strides of the source matrix as the final values. */ \ - *rs_p = rs_a; \ - *cs_p = cs_a; \ -\ - *pd_p = mr; \ - *ps_p = mr * rs_a; \ -\ - /* Set the schema to "not packed" to indicate that packing will be - skipped. */ \ - *schema = BLIS_NOT_PACKED; \ - } \ -\ - /* Since we won't be packing, simply update the buffer address provided - by the caller to point to source matrix. 
*/ \ - *p = a; \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ - *k_max = k; \ -\ - /* Determine the dimensions and strides for the packed matrix A. */ \ - if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) \ - { \ - /* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ \ - *rs_p = k; \ - *cs_p = 1; \ -\ - *pd_p = mr; \ - *ps_p = mr * k; \ -\ - /* Set the schema to "row packed" to indicate packing to plain - row storage. */ \ - *schema = BLIS_PACKED_ROWS; \ - } \ - else \ - { \ - /* All other stor3_t ids: pack A to column-stored row-panels. */ \ - *rs_p = 1; \ - *cs_p = mr; \ -\ - *pd_p = mr; \ - *ps_p = mr * k; \ -\ - /* Set the schema to "packed row panels" to indicate packing to - conventional column-stored row panels. */ \ - *schema = BLIS_PACKED_ROW_PANELS; \ - } \ -\ - /* Set the buffer address provided by the caller to point to the - memory associated with the mem_t entry acquired from the memory - broker. */ \ - *p = bli_mem_buffer( mem ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_a ) - - -// -// Define BLAS-like interfaces to the variant chooser. 
-// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* kappa, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - pack_t schema; \ - dim_t m_max; \ - dim_t k_max; \ - dim_t pd_p; \ -\ - /* Prepare the packing destination buffer. If packing is not requested, - this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_a) \ - ( \ - will_pack, \ - pack_buf_type, \ - m_alloc, k_alloc, mr, \ - cntx, \ - rntm, \ - mem, \ - thread \ - ); \ -\ - /* Determine the packing buffer and related parameters for matrix A. If A - will not be packed, then a_use will be set to point to a and the _a_use - strides will be set accordingly. */ \ - PASTEMAC(ch,packm_sup_init_a) \ - ( \ - will_pack, \ - stor_id, \ - &schema, \ - m, k, mr, \ - &m_max, &k_max, \ - a, rs_a, cs_a, \ - p, rs_p, cs_p, \ - &pd_p, ps_p, \ - cntx, \ - mem, \ - thread \ - ); \ -\ - /* Inspect whether we are going to be packing matrix A. */ \ - if ( will_pack == FALSE ) \ - { \ - /* If we aren't going to pack matrix A, then there's nothing to do. */ \ -\ - /* - printf( "blis_ packm_sup_a: not packing A.\n" ); \ - */ \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - if ( schema == BLIS_PACKED_ROWS ) \ - { \ - /* - printf( "blis_ packm_sup_a: packing A to rows.\n" ); \ - */ \ -\ - /* For plain packing by rows, use var2. 
*/ \ - PASTEMAC(ch,packm_sup_var2) \ - ( \ - transc, \ - schema, \ - m, \ - k, \ - kappa, \ - a, rs_a, cs_a, \ - *p, *rs_p, *cs_p, \ - cntx, \ - thread \ - ); \ - } \ - else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \ - { \ - /* - printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \ - */ \ -\ - /* For packing to column-stored row panels, use var1. */ \ - PASTEMAC(ch,packm_sup_var1) \ - ( \ - transc, \ - schema, \ - m, \ - k, \ - m_max, \ - k_max, \ - kappa, \ - a, rs_a, cs_a, \ - *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ - cntx, \ - thread \ - ); \ - } \ -\ - /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_a ) - diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h deleted file mode 100644 index 0aaa302c8..000000000 --- a/frame/3/bli_l3_sup_packm_a.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* m_max, \ - dim_t* k_max, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ 
- dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* kappa, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_a ) - diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c deleted file mode 100644 index 7a2030ccf..000000000 --- a/frame/3/bli_l3_sup_packm_b.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - const dim_t k_pack = k; \ - const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ -\ - /* Barrier to make sure all threads are caught up and ready to begin - the packm stage. */ \ - bli_thread_barrier( thread ); \ -\ - /* Compute the size of the memory block eneded. */ \ - siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ -\ - /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ - if ( bli_mem_is_unalloc( mem ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Acquire directly to the chief thread's mem_t that was - passed in. 
It needs to be that mem_t struct, and not a - local (temporary) mem_t, since there is no barrier until - after packing is finished, which could allow a race - condition whereby the chief thread exits the current - function before the other threads have a chance to copy - from it. (A barrier would fix that race condition, but - then again, I prefer to keep barriers to a minimum.) */ \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else /* if ( bli_mem_is_alloc( mem ) ) */ \ - { \ - /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ -\ - /* As a sanity check, we should make sure that the mem_t object isn't - associated with a block that is too small compared to the size of - the packed matrix buffer that is needed, according to the value - computed above. */ \ - siz_t mem_size = bli_mem_size( mem ); \ -\ - if ( mem_size < size_needed ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* The chief thread releases the existing block associated - with the mem_t, and then re-acquires a new block, saving - the associated mem_t to its passed-in mem_t. (See coment - above for why the acquisition needs to be directly to - the chief thread's passed-in mem_t and not a local - (temporary) mem_t. 
*/ \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else \ - { \ - /* If the mem_t entry is already allocated and sufficiently large, - then we use it as-is. No action is needed. */ \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we previously packed matrix A. */ \ - if ( did_pack == FALSE ) \ - { \ - /* If we didn't pack matrix A, there's nothing to be done. */ \ - } \ - else /* if ( did_pack == TRUE ) */ \ - { \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. 
*/ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* k_max, \ - dim_t* n_max, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - *k_max = k; \ - *n_max = n; \ -\ - /* Set the parameters for use with no packing of B (ie: using the - source matrix B directly). */ \ - { \ - /* Use the strides of the source matrix as the final values. */ \ - *rs_p = rs_b; \ - *cs_p = cs_b; \ -\ - *pd_p = nr; \ - *ps_p = nr * cs_b; \ -\ - /* Set the schema to "not packed" to indicate that packing will be - skipped. */ \ - *schema = BLIS_NOT_PACKED; \ - } \ -\ - /* Since we won't be packing, simply update the buffer address provided - by the caller to point to source matrix. */ \ - *p = b; \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - *k_max = k; \ - *n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ -\ - /* Determine the dimensions and strides for the packed matrix B. */ \ - if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) \ - { \ - /* stor3_t id values _RRC and _CRC: pack B to plain row storage. 
*/ \ - *rs_p = 1; \ - *cs_p = k; \ -\ - *pd_p = nr; \ - *ps_p = k * nr; \ -\ - /* Set the schema to "column packed" to indicate packing to plain - column storage. */ \ - *schema = BLIS_PACKED_COLUMNS; \ - } \ - else \ - { \ - /* All other stor3_t ids: pack B to row-stored column-panels. */ \ - *rs_p = nr; \ - *cs_p = 1; \ -\ - *pd_p = nr; \ - *ps_p = k * nr; \ -\ - /* Set the schema to "packed column panels" to indicate packing to - conventional row-stored column panels. */ \ - *schema = BLIS_PACKED_COL_PANELS; \ - } \ -\ - /* Set the buffer address provided by the caller to point to the - memory associated with the mem_t entry acquired from the memory - broker. */ \ - *p = bli_mem_buffer( mem ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_b ) - - -// -// Define BLAS-like interfaces to the variant chooser. -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* kappa, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - pack_t schema; \ - dim_t k_max; \ - dim_t n_max; \ - dim_t pd_p; \ -\ - /* Prepare the packing destination buffer. If packing is not requested, - this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_b) \ - ( \ - will_pack, \ - pack_buf_type, \ - k_alloc, n_alloc, nr, \ - cntx, \ - rntm, \ - mem, \ - thread \ - ); \ -\ - /* Determine the packing buffer and related parameters for matrix B. If B - will not be packed, then b_use will be set to point to b and the _b_use - strides will be set accordingly. 
*/ \ - PASTEMAC(ch,packm_sup_init_b) \ - ( \ - will_pack, \ - stor_id, \ - &schema, \ - k, n, nr, \ - &k_max, &n_max, \ - b, rs_b, cs_b, \ - p, rs_p, cs_p, \ - &pd_p, ps_p, \ - cntx, \ - mem, \ - thread \ - ); \ -\ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - /* If we aren't going to pack matrix B, then there's nothing to do. */ \ -\ - /* - printf( "blis_ packm_sup_b: not packing B.\n" ); \ - */ \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - if ( schema == BLIS_PACKED_COLUMNS ) \ - { \ - /* - printf( "blis_ packm_sup_b: packing B to columns.\n" ); \ - */ \ -\ - /* For plain packing by columns, use var2. */ \ - PASTEMAC(ch,packm_sup_var2) \ - ( \ - transc, \ - schema, \ - k, \ - n, \ - kappa, \ - b, rs_b, cs_b, \ - *p, *rs_p, *cs_p, \ - cntx, \ - thread \ - ); \ - } \ - else /* if ( schema == BLIS_PACKED_COL_PANELS ) */ \ - { \ - /* - printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \ - */ \ -\ - /* For packing to row-stored column panels, use var1. */ \ - PASTEMAC(ch,packm_sup_var1) \ - ( \ - transc, \ - schema, \ - k, \ - n, \ - k_max, \ - n_max, \ - kappa, \ - b, rs_b, cs_b, \ - *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ - cntx, \ - thread \ - ); \ - } \ -\ - /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_b ) - diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h deleted file mode 100644 index bd18e5887..000000000 --- a/frame/3/bli_l3_sup_packm_b.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* k_max, \ - dim_t* n_max, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* kappa, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_b ) - diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 54ecab8ff..357251002 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -44,17 +44,17 @@ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - 
cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ) \ { \ @@ -317,14 +317,14 @@ bli_thread_barrier( thread ); \ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h index 9c62c9c68..17cf9a482 100644 --- a/frame/3/bli_l3_sup_packm_var.h +++ b/frame/3/bli_l3_sup_packm_var.h @@ -42,17 +42,17 @@ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ); @@ -63,14 +63,14 @@ INSERT_GENTPROT_BASIC0( packm_sup_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_sup_var1n2m.c 
b/frame/3/bli_l3_sup_var1n2m.c index 61c85d6e9..76f1a96b7 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -34,34 +34,10 @@ #include "blis.h" -#define FUNCPTR_T gemmsup_fp - -typedef void (*FUNCPTR_T) - ( - bool packa, - bool packb, - conj_t conja, - conj_t conjb, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t rs_a, inc_t cs_a, - void* b, inc_t rs_b, inc_t cs_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - stor3_t eff_id, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - // // -- var1n -------------------------------------------------------------------- // -static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n); - void bli_gemmsup_ref_var1n ( trans_t trans, @@ -70,67 +46,31 @@ void bli_gemmsup_ref_var1n const obj_t* b, const obj_t* beta, const obj_t* c, - stor3_t eff_id, + stor3_t stor_id, const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { -#if 0 - obj_t at, bt; - - bli_obj_alias_to( a, &at ); - bli_obj_alias_to( b, &bt ); - - // Induce transpositions on A and/or B if either object is marked for - // transposition. We can induce "fast" transpositions since they objects - // are guaranteed to not have structure or be packed. 
- if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } - if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } - - const num_t dt = bli_obj_dt( c ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - - const dim_t k = bli_obj_width( &at ); - - void* buf_a = bli_obj_buffer_at_off( &at ); - const inc_t rs_a = bli_obj_row_stride( &at ); - const inc_t cs_a = bli_obj_col_stride( &at ); - - void* buf_b = bli_obj_buffer_at_off( &bt ); - const inc_t rs_b = bli_obj_row_stride( &bt ); - const inc_t cs_b = bli_obj_col_stride( &bt ); - - void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + const num_t dt = bli_obj_dt( c ); - void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + const dim_t dt_size = bli_dt_size( dt ); -#else - const num_t dt = bli_obj_dt( c ); + bool packa = bli_rntm_pack_a( rntm ); + bool packb = bli_rntm_pack_b( rntm ); - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + conj_t conja = bli_obj_conj_status( a ); + conj_t conjb = bli_obj_conj_status( b ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); dim_t k; - const void* buf_a = bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - const void* buf_b = bli_obj_buffer_at_off( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -163,556 +103,407 @@ void bli_gemmsup_ref_var1n } void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = 
bli_obj_col_stride( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); -#endif - - // Index into the type combination array to extract the correct - // function pointer. - FUNCPTR_T f = ftypes_var1n[dt]; - #if 1 // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); + // These optimizations are expressed by changing trans and/or stor_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx ); #endif - if ( bli_is_notrans( trans ) ) + // Note: This code explicitly performs the swaps that could be done + // implicitly in other BLIS contexts where a type-specific helper function + // was being called. + if ( bli_is_trans( trans ) ) + { + bool packtmp = packa; packa = packb; packb = packtmp; + conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; + dim_t len_tmp = m; m = n; n = len_tmp; + const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; + inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; + str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; + str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; + + stor_id = bli_stor3_trans( stor_id ); + } + + // This transposition of the stor3_t id value is inherent to variant 1. + // The reason: we assume that variant 2 is the "main" variant. The + // consequence of this is that we assume that the millikernels that + // iterate over m are registered to the "primary" kernel group associated + // with the kernel IO preference; similarly, mkernels that iterate over + // n are assumed to be registered to the "non-primary" group associated + // with the ("non-primary") anti-preference. Note that this pattern holds + // regardless of whether the mkernel set has a row or column preference. 
+ // See bli_l3_sup_int.c for a higher-level view of how this choice is made. + stor_id = bli_stor3_trans( stor_id ); + + // Query the context for various blocksizes. + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); + const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); + + // Disable modification of KC since it seems to negatively impact certain + // operations (#644). + dim_t KC = KC0; + + /* + if ( packa && packb ) { - // Invoke the function. - f - ( - packa, - packb, - conja, - conjb, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, rs_a, cs_a, - ( void* )buf_b, rs_b, cs_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - eff_id, - ( cntx_t* )cntx, - rntm, - thread - ); + KC = KC0; } - else + else if ( packb ) { - // Invoke the function (transposing the operation). - f - ( - packb, - packa, - conjb, // swap the conj values. - conja, - n, // swap the m and n dimensions. - m, - k, - ( void* )buf_alpha, - ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. - ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. - ( void* )buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. - bli_stor3_trans( eff_id ), // transpose the stor3_t id. 
- ( cntx_t* )cntx, - rntm, - thread - ); + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else if ( packa ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else // if ( !packa && !packb ) + { + if ( FALSE ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( m <= MR && n <= NR ) KC = KC0; + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; + else KC = (( KC0 / 5 ) / 4 ) * 4; + } + */ + + // Nudge NC up to a multiple of MR and MC up to a multiple of NR. + // NOTE: This is unique to variant 1 (ie: not performed in variant 2) + // because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. + const dim_t NC = bli_align_dim_to_mult( NC0, MR ); + const dim_t MC = bli_align_dim_to_mult( MC0, NR ); + + // Query the maximum blocksize for MR, which implies a maximum blocksize + // extension for the final iteration. + const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); + const dim_t MRE = MRM - MR; + + // Compute partitioning step values for each matrix of each loop. 
+ const inc_t jcstep_c = rs_c * dt_size; + const inc_t jcstep_a = rs_a * dt_size; + + const inc_t pcstep_a = cs_a * dt_size; + const inc_t pcstep_b = rs_b * dt_size; + + const inc_t icstep_c = cs_c * dt_size; + const inc_t icstep_b = cs_b * dt_size; + + const inc_t jrstep_c = rs_c * MR * dt_size; + + //const inc_t jrstep_a = rs_a * MR; + + //const inc_t irstep_c = cs_c * NR; + //const inc_t irstep_b = cs_b * NR; + + // Query the context for the sup microkernel address and cast it to its + // function pointer type. + gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); + + const char* a_00 = buf_a; + const char* b_00 = buf_b; + char* c_00 = buf_c; + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + + auxinfo_t aux; + + mem_t mem_a = BLIS_MEM_INITIALIZER; + mem_t mem_b = BLIS_MEM_INITIALIZER; +\ + // Define an array of bszid_t ids, which will act as our substitute for + // the cntl_t tree. + // NOTE: These bszid_t values, and their order, match that of the bp + // algorithm (variant 2) because they are not used to query actual + // blocksizes but rather query the ways of parallelism for the various + // loops. For example, the 2nd loop in variant 1 partitions in the m + // dimension (in increments of MR), but parallelizes that m dimension + // with BLIS_JR_NT. + // Note that this panel-block algorithm partitions an NC x KC submatrix + // of A to be packed in the 4th loop, and a KC x MC submatrix of B to be + // packed in the 3rd loop. + // 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop + bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; +\ + // Determine whether we are using more than one thread. 
+ const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); +\ + thrinfo_t* thread_jc = NULL; + thrinfo_t* thread_pc = NULL; + thrinfo_t* thread_pa = NULL; + thrinfo_t* thread_ic = NULL; + thrinfo_t* thread_pb = NULL; + thrinfo_t* thread_jr = NULL; +\ + // Pre-grow the thrinfo_t tree. + bszid_t* bszids_jc = bszids; + thread_jc = thread; + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); +\ + bszid_t* bszids_pc = &bszids_jc[1]; + thread_pc = bli_thrinfo_sub_node( thread_jc ); + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); + + bszid_t* bszids_pa = &bszids_pc[1]; + thread_pa = bli_thrinfo_sub_node( thread_pc ); + + bszid_t* bszids_ic = &bszids_pa[1]; + thread_ic = bli_thrinfo_sub_node( thread_pa ); + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); + + bszid_t* bszids_pb = &bszids_ic[1]; + thread_pb = bli_thrinfo_sub_node( thread_ic ); + + bszid_t* bszids_jr = &bszids_pb[1]; + thread_jr = bli_thrinfo_sub_node( thread_pb ); + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + + // Compute the JC loop thread range for the current thread. + dim_t jc_start, jc_end; + bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); + const dim_t m_local = jc_end - jc_start; + + // Compute number of primary and leftover components of the JC loop. + //const dim_t jc_iter = ( m_local + NC - 1 ) / NC; + const dim_t jc_left = m_local % NC; + + // Loop over the m dimension (NC rows/columns at a time). + //for ( dim_t jj = 0; jj < jc_iter; jj += 1 ) + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) + { + // Calculate the thread's current JC block dimension. + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); + + const char* a_jc = a_00 + jj * jcstep_a; + char* c_jc = c_00 + jj * jcstep_c; + + // Compute the PC loop thread range for the current thread. + const dim_t pc_start = 0, pc_end = k; + const dim_t k_local = k; + + // Compute number of primary and leftover components of the PC loop. 
+ //const dim_t pc_iter = ( k_local + KC - 1 ) / KC; + const dim_t pc_left = k_local % KC; + + // Loop over the k dimension (KC rows/columns at a time). + //for ( dim_t pp = 0; pp < pc_iter; pp += 1 ) + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) + { + // Calculate the thread's current PC block dimension. + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); + + const char* a_pc = a_jc + pp * pcstep_a; + const char* b_pc = b_00 + pp * pcstep_b; + + // Only apply beta to the first iteration of the pc loop. + const void* beta_use = ( pp == 0 ? buf_beta : one ); + + char* a_use; + inc_t rs_a_use, cs_a_use, ps_a_use; + + // Determine the packing buffer and related parameters for matrix + // A. (If A will not be packed, then a_use will be set to point to + // a and the _a_use strides will be set accordingly.) Then call + // the packm sup variant chooser, which will call the appropriate + // implementation based on the schema deduced from the stor_id. + // NOTE: packing matrix A in this panel-block algorithm corresponds + // to packing matrix B in the block-panel algorithm. + bli_packm_sup + ( + packa, + BLIS_BUFFER_FOR_B_PANEL, // This algorithm packs matrix A to + stor_id, // a "panel of B". + BLIS_NO_TRANSPOSE, + dt, + NC, KC, // This "panel of B" is (at most) NC x KC. + nc_cur, kc_cur, MR, + one, + a_pc, rs_a, cs_a, + ( void** )&a_use, &rs_a_use, &cs_a_use, + &ps_a_use, + cntx, + rntm, + &mem_a, + thread_pa + ); + + // Alias a_use so that it's clear this is our current block of + // matrix A. + const char* a_pc_use = a_use; + + // We don't need to embed the panel stride of A within the auxinfo_t + // object because this variant iterates through A in the jr loop, + // which occurs here, within the macrokernel, not within the + // millikernel. + //bli_auxinfo_set_ps_a( ps_a_use, &aux ); + + // Compute the IC loop thread range for the current thread. 
+ dim_t ic_start, ic_end; + bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); + const dim_t n_local = ic_end - ic_start; + + // Compute number of primary and leftover components of the IC loop. + //const dim_t ic_iter = ( n_local + MC - 1 ) / MC; + const dim_t ic_left = n_local % MC; + + // Loop over the n dimension (MC rows at a time). + //for ( dim_t ii = 0; ii < ic_iter; ii += 1 ) + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) + { + // Calculate the thread's current IC block dimension. + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); + + const char* b_ic = b_pc + ii * icstep_b; + char* c_ic = c_jc + ii * icstep_c; + + char* b_use; + inc_t rs_b_use, cs_b_use, ps_b_use; + + // Determine the packing buffer and related parameters for matrix + // B. (If B will not be packed, then b_use will be set to point to + // b and the _b_use strides will be set accordingly.) Then call + // the packm sup variant chooser, which will call the appropriate + // implementation based on the schema deduced from the stor_id. + // NOTE: packing matrix B in this panel-block algorithm corresponds + // to packing matrix A in the block-panel algorithm. + bli_packm_sup + ( + packb, + BLIS_BUFFER_FOR_A_BLOCK, // This algorithm packs matrix B to + stor_id, // a "block of A". + BLIS_NO_TRANSPOSE, + dt, + MC, KC, // This "block of A" is (at most) KC x MC. + mc_cur, kc_cur, NR, + one, + b_ic, cs_b, rs_b, + ( void** )&b_use, &cs_b_use, &rs_b_use, + &ps_b_use, + cntx, + rntm, + &mem_b, + thread_pb + ); + + // Alias b_use so that it's clear this is our current block of + // matrix B. + const char* b_ic_use = b_use; + + // Embed the panel stride of B within the auxinfo_t object. The + // millikernel will query and use this to iterate through + // micropanels of B. + bli_auxinfo_set_ps_b( ps_b_use, &aux ); + + // Compute number of primary and leftover components of the JR loop. 
+ dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; + dim_t jr_left = nc_cur % MR; + + // An optimization: allow the last jr iteration to contain up to MRE + // rows of C and A. (If MRE > MR, the mkernel has agreed to handle + // these cases.) Note that this prevents us from declaring jr_iter and + // jr_left as const. NOTE: We forgo this optimization when packing A + // since packing an extended edge case is not yet supported. + if ( !packa && !is_mt ) + if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) + { + jr_iter--; jr_left += MR; + } + + // Compute the JR loop thread range for the current thread. + dim_t jr_start, jr_end; + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); + + // Loop over the m dimension (NR columns at a time). + //for ( dim_t j = 0; j < jr_iter; j += 1 ) + for ( dim_t j = jr_start; j < jr_end; j += 1 ) + { + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); + + //ctype* a_jr = a_pc + j * jrstep_a; + const char* a_jr = a_pc_use + j * ps_a_use * dt_size; + char* c_jr = c_ic + j * jrstep_c; + + //const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; + //const dim_t ir_left = mc_cur % NR; + + // Loop over the n dimension (MR rows at a time). + { + // Invoke the gemmsup millikernel. + gemmsup_ker + ( + conja, + conjb, + nr_cur, // Notice: nr_cur <= MR. + mc_cur, // Recall: mc_cur partitions the n dimension! + kc_cur, + ( void* )buf_alpha, + ( void* )a_jr, rs_a_use, cs_a_use, + ( void* )b_ic_use, rs_b_use, cs_b_use, + ( void* )beta_use, + ( void* )c_jr, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + // NOTE: This barrier is only needed if we are packing A (since + // that matrix is packed within the pc loop of this variant). + if ( packa ) bli_thread_barrier( thread_pa ); + } } -} + // Release any memory that was acquired for packing matrices A and B. 
+ bli_packm_sup_finalize_mem + ( + packa, + rntm, + &mem_a, + thread_pa + ); + bli_packm_sup_finalize_mem + ( + packb, + rntm, + &mem_b, + thread_pb + ); -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* If m or n is zero, return immediately. */ \ - if ( bli_zero_dim2( m, n ) ) return; \ -\ - /* If k < 1 or alpha is zero, scale by beta and return. */ \ - if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - PASTEMAC(ch,scalm) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m, n, \ - beta, \ - c, rs_c, cs_c \ - ); \ - } \ - return; \ - } \ -\ - /* This transposition of the stor3_t id value is inherent to variant 1. - The reason: we assume that variant 2 is the "main" variant. The - consequence of this is that we assume that the millikernels that - iterate over m are registered to the "primary" kernel group associated - with the kernel IO preference; similarly, mkernels that iterate over - n are assumed to be registered to the "non-primary" group associated - with the ("non-primary") anti-preference. Note that this pattern holds - regardless of whether the mkernel set has a row or column preference.) - See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \ - stor_id = bli_stor3_trans( stor_id ); \ -\ - /* Query the context for various blocksizes. 
*/ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - /* Disable modification of KC since it seems to negatively impact certain operations (#644). */ \ - dim_t KC = KC0; \ - /* \ - dim_t KC; \ - if ( packa && packb ) \ - { \ - KC = KC0; \ - } \ - else if ( packb ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else if ( packa ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else *//* if ( !packa && !packb ) *//* \ - { \ - if ( FALSE ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( m <= MR && n <= NR ) KC = KC0; \ - else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ - else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ - else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ - else KC = (( KC0 / 5 ) / 4 ) * 4; \ - }*/ \ -\ - /* Nudge NC up to a multiple of MR and MC up to a multiple of NR. - NOTE: This is unique to variant 1 (ie: not performed in variant 2) - because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. 
*/ \ - const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ - const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ -\ - /* Query the maximum blocksize for MR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const dim_t MRE = MRM - MR; \ -\ - /* Compute partitioning step values for each matrix of each loop. */ \ - const inc_t jcstep_c = rs_c; \ - const inc_t jcstep_a = rs_a; \ -\ - const inc_t pcstep_a = cs_a; \ - const inc_t pcstep_b = rs_b; \ -\ - const inc_t icstep_c = cs_c; \ - const inc_t icstep_b = cs_b; \ -\ - const inc_t jrstep_c = rs_c * MR; \ -\ - /* - const inc_t jrstep_a = rs_a * MR; \ -\ - const inc_t irstep_c = cs_c * NR; \ - const inc_t irstep_b = cs_b * NR; \ - */ \ -\ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ -\ - ctype* a_00 = a; \ - ctype* b_00 = b; \ - ctype* c_00 = c; \ - ctype* alpha_cast = alpha; \ - ctype* beta_cast = beta; \ -\ - /* Make local copies of beta and one scalars to prevent any unnecessary - sharing of cache lines between the cores' caches. */ \ - ctype beta_local = *beta_cast; \ - ctype one_local = *PASTEMAC(ch,1); \ -\ - auxinfo_t aux; \ -\ - /* Parse and interpret the contents of the rntm_t object to properly - set the ways of parallelism for each loop. */ \ - /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ -\ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. 
An alternative way of initializing the - mem_t entries is: - - bli_mem_clear( &mem_a ); \ - bli_mem_clear( &mem_b ); \ - */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ - /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. - NOTE: These bszid_t values, and their order, match that of the bp - algorithm (variant 2) because they are not used to query actual - blocksizes but rather query the ways of parallelism for the various - loops. For example, the 2nd loop in variant 1 partitions in the m - dimension (in increments of MR), but parallelizes that m dimension - with BLIS_JR_NT. The only difference is that the _packa and _packb - arrays have been adjusted for the semantic difference in order in - which packa and packb nodes are encountered in the thrinfo tree. - That is, this panel-block algorithm partitions an NC x KC submatrix - of A to be packed in the 4th loop, and a KC x MC submatrix of B - to be packed in the 3rd loop. */ \ - /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ - bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* bszids; \ -\ - /* Set the bszids pointer to the correct bszids array above based on which - matrices (if any) are being packed. */ \ - if ( packa ) { if ( packb ) bszids = bszids_packab; \ - else bszids = bszids_packa; } \ - else { if ( packb ) bszids = bszids_packb; \ - else bszids = bszids_nopack; } \ -\ - /* Determine whether we are using more than one thread. 
*/ \ - const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ -\ - thrinfo_t* thread_jc = NULL; \ - thrinfo_t* thread_pc = NULL; \ - thrinfo_t* thread_pa = NULL; \ - thrinfo_t* thread_ic = NULL; \ - thrinfo_t* thread_pb = NULL; \ - thrinfo_t* thread_jr = NULL; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jc = bszids; \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ -\ - /* Compute the JC loop thread range for the current thread. */ \ - dim_t jc_start, jc_end; \ - bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \ - const dim_t m_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. */ \ - /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = m_local % NC; \ -\ - /* Loop over the m dimension (NC rows/columns at a time). */ \ - /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ - for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ - { \ - /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ -\ - ctype* a_jc = a_00 + jj * jcstep_a; \ - ctype* c_jc = c_00 + jj * jcstep_c; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ -\ - /* Compute the PC loop thread range for the current thread. */ \ - const dim_t pc_start = 0, pc_end = k; \ - const dim_t k_local = k; \ -\ - /* Compute number of primary and leftover components of the PC loop. */ \ - /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ - const dim_t pc_left = k_local % KC; \ -\ - /* Loop over the k dimension (KC rows/columns at a time). */ \ - /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ - for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ - { \ - /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? 
KC : pc_left ); \ -\ - ctype* a_pc = a_jc + pp * pcstep_a; \ - ctype* b_pc = b_00 + pp * pcstep_b; \ -\ - /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ -\ - ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing A. If we won't be packing A, we alias to - the _pc variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pa; \ - if ( packa ) { bszids_pa = &bszids_pc[1]; \ - thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ - else { bszids_pa = &bszids_pc[0]; \ - thread_pa = thread_pc; } \ -\ - /* Determine the packing buffer and related parameters for matrix - A. (If A will not be packed, then a_use will be set to point to - a and the _a_use strides will be set accordingly.) Then call - the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. - NOTE: packing matrix A in this panel-block algorithm corresponds - to packing matrix B in the block-panel algorithm. */ \ - PASTEMAC(ch,packm_sup_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \ - stor_id, /* a "panel of B". */ \ - BLIS_NO_TRANSPOSE, \ - NC, KC, /* This "panel of B" is (at most) NC x KC. */ \ - nc_cur, kc_cur, MR, \ - &one_local, \ - a_pc, rs_a, cs_a, \ - &a_use, &rs_a_use, &cs_a_use, \ - &ps_a_use, \ - cntx, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ -\ - /* Alias a_use so that it's clear this is our current block of - matrix A. 
*/ \ - ctype* a_pc_use = a_use; \ -\ - /* We don't need to embed the panel stride of A within the auxinfo_t - object because this variant iterates through A in the jr loop, - which occurs here, within the macrokernel, not within the - millikernel. */ \ - /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_ic = &bszids_pa[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ -\ - /* Compute the IC loop thread range for the current thread. */ \ - dim_t ic_start, ic_end; \ - bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \ - const dim_t n_local = ic_end - ic_start; \ -\ - /* Compute number of primary and leftover components of the IC loop. */ \ - /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \ - const dim_t ic_left = n_local % MC; \ -\ - /* Loop over the n dimension (MC rows at a time). */ \ - /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ - for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ - { \ - /* Calculate the thread's current IC block dimension. */ \ - const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ -\ - ctype* b_ic = b_pc + ii * icstep_b; \ - ctype* c_ic = c_jc + ii * icstep_c; \ -\ - ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing A. If we won't be packing A, we alias to - the _pc variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). 
*/ \ - bszid_t* bszids_pb; \ - if ( packb ) { bszids_pb = &bszids_ic[1]; \ - thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ - else { bszids_pb = &bszids_ic[0]; \ - thread_pb = thread_ic; } \ -\ - /* Determine the packing buffer and related parameters for matrix - B. (If B will not be packed, then b_use will be set to point to - b and the _b_use strides will be set accordingly.) Then call - the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. - NOTE: packing matrix B in this panel-block algorithm corresponds - to packing matrix A in the block-panel algorithm. */ \ - PASTEMAC(ch,packm_sup_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \ - stor_id, /* a "block of A". */ \ - BLIS_NO_TRANSPOSE, \ - KC, MC, /* This "block of A" is (at most) KC x MC. */ \ - kc_cur, mc_cur, NR, \ - &one_local, \ - b_ic, rs_b, cs_b, \ - &b_use, &rs_b_use, &cs_b_use, \ - &ps_b_use, \ - cntx, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ - /* Alias b_use so that it's clear this is our current block of - matrix B. */ \ - ctype* b_ic_use = b_use; \ -\ - /* Embed the panel stride of B within the auxinfo_t object. The - millikernel will query and use this to iterate through - micropanels of B. */ \ - bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jr = &bszids_pb[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ -\ - /* Compute number of primary and leftover components of the JR loop. */ \ - dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ - dim_t jr_left = nc_cur % MR; \ -\ - /* An optimization: allow the last jr iteration to contain up to MRE - rows of C and A. (If MRE > MR, the mkernel has agreed to handle - these cases.) Note that this prevents us from declaring jr_iter and - jr_left as const. 
NOTE: We forgo this optimization when packing A - since packing an extended edge case is not yet supported. */ \ - if ( !packa && !is_mt ) \ - if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \ - { \ - jr_iter--; jr_left += MR; \ - } \ -\ - /* Compute the JR loop thread range for the current thread. */ \ - dim_t jr_start, jr_end; \ - bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ -\ - /* Loop over the m dimension (NR columns at a time). */ \ - /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ - for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ - { \ - const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \ -\ - /* - ctype* a_jr = a_pc + j * jrstep_a; \ - */ \ - ctype* a_jr = a_pc_use + j * ps_a_use; \ - ctype* c_jr = c_ic + j * jrstep_c; \ -\ - /* - const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ - const dim_t ir_left = mc_cur % NR; \ - */ \ -\ - /* Loop over the n dimension (MR rows at a time). */ \ - { \ - /* Invoke the gemmsup millikernel. */ \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - nr_cur, /* Notice: nr_cur <= MR. */ \ - mc_cur, /* Recall: mc_cur partitions the n dimension! */ \ - kc_cur, \ - alpha_cast, \ - a_jr, rs_a_use, cs_a_use, \ - b_ic_use, rs_b_use, cs_b_use, \ - beta_use, \ - c_jr, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* NOTE: This barrier is only needed if we are packing A (since - that matrix is packed within the pc loop of this variant). */ \ - if ( packa ) bli_thread_barrier( thread_pa ); \ - } \ - } \ -\ - /* Release any memory that was acquired for packing matrices A and B. 
*/ \ - PASTEMAC(ch,packm_sup_finalize_mem_a) \ - ( \ - packa, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTEMAC(ch,packm_sup_finalize_mem_b) \ - ( \ - packb, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ /* -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n ) - // // -- var2m -------------------------------------------------------------------- // -static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m); - void bli_gemmsup_ref_var2m ( trans_t trans, @@ -721,67 +512,30 @@ void bli_gemmsup_ref_var2m const obj_t* b, const obj_t* beta, const obj_t* c, - stor3_t eff_id, + stor3_t stor_id, const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { -#if 0 - obj_t at, bt; - - bli_obj_alias_to( a, &at ); - bli_obj_alias_to( b, &bt ); - - // Induce transpositions on A and/or B if either object is marked for - // transposition. We can induce "fast" transpositions since they objects - // are guaranteed to not have structure or be packed. 
- if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } - if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } - - const num_t dt = bli_obj_dt( c ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - - const dim_t k = bli_obj_width( &at ); - - void* buf_a = bli_obj_buffer_at_off( &at ); - const inc_t rs_a = bli_obj_row_stride( &at ); - const inc_t cs_a = bli_obj_col_stride( &at ); - - void* buf_b = bli_obj_buffer_at_off( &bt ); - const inc_t rs_b = bli_obj_row_stride( &bt ); - const inc_t cs_b = bli_obj_col_stride( &bt ); - - void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); - -#else - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + bool packa = bli_rntm_pack_a( rntm ); + bool packb = bli_rntm_pack_b( rntm ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + conj_t conja = bli_obj_conj_status( a ); + conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); dim_t k; - const void* buf_a = bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - const void* buf_b = bli_obj_buffer_at_off( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -814,516 +568,371 @@ void bli_gemmsup_ref_var2m } void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = 
bli_obj_col_stride( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); -#endif - - // Index into the type combination array to extract the correct - // function pointer. - FUNCPTR_T f = ftypes_var2m[dt]; - #if 1 // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); + // These optimizations are expressed by changing trans and/or stor_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx ); #endif - if ( bli_is_notrans( trans ) ) + // Note: This code explicitly performs the swaps that could be done + // implicitly in other BLIS contexts where a type-specific helper function + // was being called. + if ( bli_is_trans( trans ) ) { - // Invoke the function. - f - ( - packa, - packb, - conja, - conjb, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, rs_a, cs_a, - ( void* )buf_b, rs_b, cs_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - eff_id, - ( cntx_t* )cntx, - rntm, - thread - ); + bool packtmp = packa; packa = packb; packb = packtmp; + conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; + dim_t len_tmp = m; m = n; n = len_tmp; + const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; + inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; + str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; + str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; + + stor_id = bli_stor3_trans( stor_id ); } - else + + // Query the context for various blocksizes. 
+ const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); + const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); + + // Disable modification of KC since it seems to negatively impact certain + // operations (#644). + dim_t KC = KC0; + + /* + if ( packa && packb ) { - // Invoke the function (transposing the operation). - f - ( - packb, // swap the pack values. - packa, - conjb, // swap the conj values. - conja, - n, // swap the m and n dimensions. - m, - k, - ( void* )buf_alpha, - ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. - ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. - ( void* )buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. - bli_stor3_trans( eff_id ), // transpose the stor3_t id. - ( cntx_t* )cntx, - rntm, - thread - ); + KC = KC0; } -} + else if ( packb ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else if ( packa ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else // if ( !packa && !packb ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( m <= MR && n <= NR ) KC = KC0; + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; + else KC = (( 
KC0 / 5 ) / 4 ) * 4; + } + */ + // Query the maximum blocksize for NR, which implies a maximum blocksize + // extension for the final iteration. + const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); + const dim_t NRE = NRM - NR; + + // Compute partitioning step values for each matrix of each loop. + const inc_t jcstep_c = cs_c * dt_size; + const inc_t jcstep_b = cs_b * dt_size; + + const inc_t pcstep_a = cs_a * dt_size; + const inc_t pcstep_b = rs_b * dt_size; + + const inc_t icstep_c = rs_c * dt_size; + const inc_t icstep_a = rs_a * dt_size; + + const inc_t jrstep_c = cs_c * NR * dt_size; + + //const inc_t jrstep_b = cs_b * NR; + //( void )jrstep_b; + + //const inc_t irstep_c = rs_c * MR; + //const inc_t irstep_a = rs_a * MR; + + // Query the context for the sup microkernel address and cast it to its + // function pointer type. + gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); + + const char* a_00 = buf_a; + const char* b_00 = buf_b; + char* c_00 = buf_c; + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + + auxinfo_t aux; + + mem_t mem_a = BLIS_MEM_INITIALIZER; + mem_t mem_b = BLIS_MEM_INITIALIZER; + + // Define an array of bszid_t ids, which will act as our substitute for + // the cntl_t tree. + // 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop + bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; + + // Determine whether we are using more than one thread. + const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); + + thrinfo_t* thread_jc = NULL; + thrinfo_t* thread_pc = NULL; + thrinfo_t* thread_pb = NULL; + thrinfo_t* thread_ic = NULL; + thrinfo_t* thread_pa = NULL; + thrinfo_t* thread_jr = NULL; + + // Pre-grow the thrinfo_t tree. 
+ bszid_t* bszids_jc = bszids; + thread_jc = thread; + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); + + bszid_t* bszids_pc = &bszids_jc[1]; + thread_pc = bli_thrinfo_sub_node( thread_jc ); + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); + + bszid_t* bszids_pb = &bszids_pc[1]; + thread_pb = bli_thrinfo_sub_node( thread_pc ); + + bszid_t* bszids_ic = &bszids_pb[1]; + thread_ic = bli_thrinfo_sub_node( thread_pb ); + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); + + bszid_t* bszids_pa = &bszids_ic[1]; + thread_pa = bli_thrinfo_sub_node( thread_ic ); + + bszid_t* bszids_jr = &bszids_pa[1]; + thread_jr = bli_thrinfo_sub_node( thread_pa ); + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + + // Compute the JC loop thread range for the current thread. + dim_t jc_start, jc_end; + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); + const dim_t n_local = jc_end - jc_start; + + // Compute number of primary and leftover components of the JC loop. + //const dim_t jc_iter = ( n_local + NC - 1 ) / NC; + const dim_t jc_left = n_local % NC; + + // Loop over the n dimension (NC rows/columns at a time). + //for ( dim_t jj = 0; jj < jc_iter; jj += 1 ) + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) + { + // Calculate the thread's current JC block dimension. + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); + + const char* b_jc = b_00 + jj * jcstep_b; + char* c_jc = c_00 + jj * jcstep_c; + + // Compute the PC loop thread range for the current thread. + const dim_t pc_start = 0, pc_end = k; + const dim_t k_local = k; + + // Compute number of primary and leftover components of the PC loop. + //const dim_t pc_iter = ( k_local + KC - 1 ) / KC; + const dim_t pc_left = k_local % KC; + + // Loop over the k dimension (KC rows/columns at a time). + //for ( dim_t pp = 0; pp < pc_iter; pp += 1 ) + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) + { + // Calculate the thread's current PC block dimension. 
+ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); + + const char* a_pc = a_00 + pp * pcstep_a; + const char* b_pc = b_jc + pp * pcstep_b; + + // Only apply beta to the first iteration of the pc loop. + const void* beta_use = ( pp == 0 ? buf_beta : one ); + + char* b_use; + inc_t rs_b_use, cs_b_use, ps_b_use; + + // Determine the packing buffer and related parameters for matrix + // B. (If B will not be packed, then a_use will be set to point to + // b and the _b_use strides will be set accordingly.) Then call + // the packm sup variant chooser, which will call the appropriate + // implementation based on the schema deduced from the stor_id. + bli_packm_sup + ( + packb, + BLIS_BUFFER_FOR_B_PANEL, // This algorithm packs matrix B to + stor_id, // a "panel of B." + BLIS_NO_TRANSPOSE, + dt, + NC, KC, // This "panel of B" is (at most) KC x NC. + nc_cur, kc_cur, NR, + one, + b_pc, cs_b, rs_b, + ( void** )&b_use, &cs_b_use, &rs_b_use, + &ps_b_use, + cntx, + rntm, + &mem_b, + thread_pb + ); + + // Alias b_use so that it's clear this is our current block of + // matrix B. + char* b_pc_use = b_use; + + // We don't need to embed the panel stride of B within the auxinfo_t + // object because this variant iterates through B in the jr loop, + // which occurs here, within the macrokernel, not within the + // millikernel. + //bli_auxinfo_set_ps_b( ps_b_use, &aux ); + + // Compute the IC loop thread range for the current thread. + dim_t ic_start, ic_end; + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); + const dim_t m_local = ic_end - ic_start; + + // Compute number of primary and leftover components of the IC loop. + //const dim_t ic_iter = ( m_local + MC - 1 ) / MC; + const dim_t ic_left = m_local % MC; + + // Loop over the m dimension (MC rows at a time). + //for ( dim_t ii = 0; ii < ic_iter; ii += 1 ) + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) + { + // Calculate the thread's current IC block dimension. 
+ const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); + + const char* a_ic = a_pc + ii * icstep_a; + char* c_ic = c_jc + ii * icstep_c; + + char* a_use; + inc_t rs_a_use, cs_a_use, ps_a_use; + + // Determine the packing buffer and related parameters for matrix + // A. (If A will not be packed, then a_use will be set to point to + // a and the _a_use strides will be set accordingly.) Then call + // the packm sup variant chooser, which will call the appropriate + // implementation based on the schema deduced from the stor_id. + bli_packm_sup + ( + packa, + BLIS_BUFFER_FOR_A_BLOCK, // This algorithm packs matrix A to + stor_id, // a "block of A." + BLIS_NO_TRANSPOSE, + dt, + MC, KC, // This "block of A" is (at most) MC x KC. + mc_cur, kc_cur, MR, + one, + a_ic, rs_a, cs_a, + ( void** )&a_use, &rs_a_use, &cs_a_use, + &ps_a_use, + cntx, + rntm, + &mem_a, + thread_pa + ); + + // Alias a_use so that it's clear this is our current block of + // matrix A. + char* a_ic_use = a_use; + + // Embed the panel stride of A within the auxinfo_t object. The + // millikernel will query and use this to iterate through + // micropanels of A (if needed). + bli_auxinfo_set_ps_a( ps_a_use, &aux ); + + // Compute number of primary and leftover components of the JR loop. + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; + dim_t jr_left = nc_cur % NR; + + // An optimization: allow the last jr iteration to contain up to NRE + // columns of C and B. (If NRE > NR, the mkernel has agreed to handle + // these cases.) Note that this prevents us from declaring jr_iter and + // jr_left as const. NOTE: We forgo this optimization when packing B + // since packing an extended edge case is not yet supported. + if ( !packb && !is_mt ) + if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) + { + jr_iter--; jr_left += NR; + } + + // Compute the JR loop thread range for the current thread. 
+ dim_t jr_start, jr_end; + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); + + // Loop over the n dimension (NR columns at a time). + //for ( dim_t j = 0; j < jr_iter; j += 1 ) + for ( dim_t j = jr_start; j < jr_end; j += 1 ) + { + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); + + //ctype* b_jr = b_pc_use + j * jrstep_b; + const char* b_jr = b_pc_use + j * ps_b_use * dt_size; + char* c_jr = c_ic + j * jrstep_c; + + //const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; + //const dim_t ir_left = mc_cur % MR; + + // Loop over the m dimension (MR rows at a time). + { + // Invoke the gemmsup millikernel. + gemmsup_ker + ( + conja, + conjb, + mc_cur, + nr_cur, + kc_cur, + ( void* )buf_alpha, + ( void* )a_ic_use, rs_a_use, cs_a_use, + ( void* )b_jr, rs_b_use, cs_b_use, + ( void* )beta_use, + ( void* )c_jr, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + // NOTE: This barrier is only needed if we are packing B (since + // that matrix is packed within the pc loop of this variant). + if ( packb ) bli_thread_barrier( thread_pb ); + } + } + + // Release any memory that was acquired for packing matrices A and B. + bli_packm_sup_finalize_mem + ( + packa, + rntm, + &mem_a, + thread_pa + ); + bli_packm_sup_finalize_mem + ( + packb, + rntm, + &mem_b, + thread_pb + ); -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* If m or n is zero, return immediately. */ \ - if ( bli_zero_dim2( m, n ) ) return; \ -\ - /* If k < 1 or alpha is zero, scale by beta and return. 
*/ \ - if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - PASTEMAC(ch,scalm) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m, n, \ - beta, \ - c, rs_c, cs_c \ - ); \ - } \ - return; \ - } \ -\ - /* Query the context for various blocksizes. */ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - /* Disable modification of KC since it seems to negatively impact certain operations (#644). */ \ - dim_t KC = KC0; \ - /* \ - dim_t KC; \ - if ( packa && packb ) \ - { \ - KC = KC0; \ - } \ - else if ( packb ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else if ( packa ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else *//* if ( !packa && !packb ) *//* \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( m <= MR && n <= NR ) KC = KC0; \ - else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ - else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ - else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ - else KC = (( KC0 / 5 ) / 4 ) * 4; \ - }*/ \ -\ - /* Query the maximum blocksize 
for NR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ - const dim_t NRE = NRM - NR; \ -\ - /* Compute partitioning step values for each matrix of each loop. */ \ - const inc_t jcstep_c = cs_c; \ - const inc_t jcstep_b = cs_b; \ -\ - const inc_t pcstep_a = cs_a; \ - const inc_t pcstep_b = rs_b; \ -\ - const inc_t icstep_c = rs_c; \ - const inc_t icstep_a = rs_a; \ -\ - const inc_t jrstep_c = cs_c * NR; \ -\ - /* - const inc_t jrstep_b = cs_b * NR; \ - ( void )jrstep_b; \ -\ - const inc_t irstep_c = rs_c * MR; \ - const inc_t irstep_a = rs_a * MR; \ - */ \ -\ - /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ -\ - ctype* a_00 = a; \ - ctype* b_00 = b; \ - ctype* c_00 = c; \ - ctype* alpha_cast = alpha; \ - ctype* beta_cast = beta; \ -\ - /* Make local copies of beta and one scalars to prevent any unnecessary - sharing of cache lines between the cores' caches. */ \ - ctype beta_local = *beta_cast; \ - ctype one_local = *PASTEMAC(ch,1); \ -\ - auxinfo_t aux; \ -\ - /* Parse and interpret the contents of the rntm_t object to properly - set the ways of parallelism for each loop. */ \ - /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ -\ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. An alternative way of initializing the - mem_t entries is: - - bli_mem_clear( &mem_a ); \ - bli_mem_clear( &mem_b ); \ - */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ - /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. 
*/ \ - /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ - bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* bszids; \ -\ - /* Set the bszids pointer to the correct bszids array above based on which - matrices (if any) are being packed. */ \ - if ( packa ) { if ( packb ) bszids = bszids_packab; \ - else bszids = bszids_packa; } \ - else { if ( packb ) bszids = bszids_packb; \ - else bszids = bszids_nopack; } \ -\ - /* Determine whether we are using more than one thread. */ \ - const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ -\ - thrinfo_t* thread_jc = NULL; \ - thrinfo_t* thread_pc = NULL; \ - thrinfo_t* thread_pb = NULL; \ - thrinfo_t* thread_ic = NULL; \ - thrinfo_t* thread_pa = NULL; \ - thrinfo_t* thread_jr = NULL; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jc = bszids; \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ -\ - /* Compute the JC loop thread range for the current thread. */ \ - dim_t jc_start, jc_end; \ - bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ - const dim_t n_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. */ \ - /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = n_local % NC; \ -\ - /* Loop over the n dimension (NC rows/columns at a time). */ \ - /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ - for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ - { \ - /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); \ -\ - ctype* b_jc = b_00 + jj * jcstep_b; \ - ctype* c_jc = c_00 + jj * jcstep_c; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ -\ - /* Compute the PC loop thread range for the current thread. */ \ - const dim_t pc_start = 0, pc_end = k; \ - const dim_t k_local = k; \ -\ - /* Compute number of primary and leftover components of the PC loop. */ \ - /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ - const dim_t pc_left = k_local % KC; \ -\ - /* Loop over the k dimension (KC rows/columns at a time). */ \ - /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ - for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ - { \ - /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ -\ - ctype* a_pc = a_00 + pp * pcstep_a; \ - ctype* b_pc = b_jc + pp * pcstep_b; \ -\ - /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ -\ - ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing B. If we won't be packing B, we alias to - the _pc variables so that code further down can unconditionally - reference the _pb variables. Note that *if* we will be packing - B, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pb; \ - if ( packb ) { bszids_pb = &bszids_pc[1]; \ - thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ - else { bszids_pb = &bszids_pc[0]; \ - thread_pb = thread_pc; } \ -\ - /* Determine the packing buffer and related parameters for matrix - B. 
(If B will not be packed, then a_use will be set to point to - b and the _b_use strides will be set accordingly.) Then call - the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. */ \ - PASTEMAC(ch,packm_sup_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \ - stor_id, /* a "panel of B." */ \ - BLIS_NO_TRANSPOSE, \ - KC, NC, /* This "panel of B" is (at most) KC x NC. */ \ - kc_cur, nc_cur, NR, \ - &one_local, \ - b_pc, rs_b, cs_b, \ - &b_use, &rs_b_use, &cs_b_use, \ - &ps_b_use, \ - cntx, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ - /* Alias b_use so that it's clear this is our current block of - matrix B. */ \ - ctype* b_pc_use = b_use; \ -\ - /* We don't need to embed the panel stride of B within the auxinfo_t - object because this variant iterates through B in the jr loop, - which occurs here, within the macrokernel, not within the - millikernel. */ \ - /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_ic = &bszids_pb[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ -\ - /* Compute the IC loop thread range for the current thread. */ \ - dim_t ic_start, ic_end; \ - bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ - const dim_t m_local = ic_end - ic_start; \ -\ - /* Compute number of primary and leftover components of the IC loop. */ \ - /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ - const dim_t ic_left = m_local % MC; \ -\ - /* Loop over the m dimension (MC rows at a time). */ \ - /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ - for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ - { \ - /* Calculate the thread's current IC block dimension. */ \ - const dim_t mc_cur = ( MC <= ic_end - ii ? 
MC : ic_left ); \ -\ - ctype* a_ic = a_pc + ii * icstep_a; \ - ctype* c_ic = c_jc + ii * icstep_c; \ -\ - ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing B. If we won't be packing A, we alias to - the _ic variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pa; \ - if ( packa ) { bszids_pa = &bszids_ic[1]; \ - thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ - else { bszids_pa = &bszids_ic[0]; \ - thread_pa = thread_ic; } \ -\ - /* Determine the packing buffer and related parameters for matrix - A. (If A will not be packed, then a_use will be set to point to - a and the _a_use strides will be set accordingly.) Then call - the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. */ \ - PASTEMAC(ch,packm_sup_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ - stor_id, /* a "block of A." */ \ - BLIS_NO_TRANSPOSE, \ - MC, KC, /* This "block of A" is (at most) MC x KC. */ \ - mc_cur, kc_cur, MR, \ - &one_local, \ - a_ic, rs_a, cs_a, \ - &a_use, &rs_a_use, &cs_a_use, \ - &ps_a_use, \ - cntx, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ -\ - /* Alias a_use so that it's clear this is our current block of - matrix A. */ \ - ctype* a_ic_use = a_use; \ -\ - /* Embed the panel stride of A within the auxinfo_t object. The - millikernel will query and use this to iterate through - micropanels of A (if needed). */ \ - bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ -\ - /* Grow the thrinfo_t tree. 
*/ \ - bszid_t* bszids_jr = &bszids_pa[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ -\ - /* Compute number of primary and leftover components of the JR loop. */ \ - dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ - dim_t jr_left = nc_cur % NR; \ -\ - /* An optimization: allow the last jr iteration to contain up to NRE - columns of C and B. (If NRE > NR, the mkernel has agreed to handle - these cases.) Note that this prevents us from declaring jr_iter and - jr_left as const. NOTE: We forgo this optimization when packing B - since packing an extended edge case is not yet supported. */ \ - if ( !packb && !is_mt ) \ - if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ - { \ - jr_iter--; jr_left += NR; \ - } \ -\ - /* Compute the JR loop thread range for the current thread. */ \ - dim_t jr_start, jr_end; \ - bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ - for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ - { \ - const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ -\ - /* - ctype* b_jr = b_pc_use + j * jrstep_b; \ - */ \ - ctype* b_jr = b_pc_use + j * ps_b_use; \ - ctype* c_jr = c_ic + j * jrstep_c; \ -\ - /* - const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ - const dim_t ir_left = mc_cur % MR; \ - */ \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - { \ - /* Invoke the gemmsup millikernel. */ \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - mc_cur, \ - nr_cur, \ - kc_cur, \ - alpha_cast, \ - a_ic_use, rs_a_use, cs_a_use, \ - b_jr, rs_b_use, cs_b_use, \ - beta_use, \ - c_jr, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* NOTE: This barrier is only needed if we are packing B (since - that matrix is packed within the pc loop of this variant). 
*/ \ - if ( packb ) bli_thread_barrier( thread_pb ); \ - } \ - } \ -\ - /* Release any memory that was acquired for packing matrices A and B. */ \ - PASTEMAC(ch,packm_sup_finalize_mem_a) \ - ( \ - packa, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTEMAC(ch,packm_sup_finalize_mem_b) \ - ( \ - packb, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ /* -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m ) - diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index df9a747ab..be6b17f39 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -89,32 +89,6 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) -INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) - // ----------------------------------------------------------------------------- BLIS_INLINE void 
bli_gemmsup_ref_var1n2m_opt_cases diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 1ae904abf..5f992bd67 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -53,22 +53,6 @@ void bli_gemm_front obj_t b_local; obj_t c_local; - // If C has a zero dimension, return early. - if ( bli_obj_has_zero_dim( c ) ) - { - return; - } - - // If alpha is zero, or if A or B has a zero dimension, scale C by beta - // and return early. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) || - bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - bli_scalm( beta, c ); - return; - } - #if 0 #ifdef BLIS_ENABLE_SMALL_MATRIX // Only handle small problems separately for homogeneous datatypes. diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index e291b5f27..49b32c976 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -53,22 +53,6 @@ void bli_gemmt_front obj_t b_local; obj_t c_local; - // If C has a zero dimension, return early. - if ( bli_obj_has_zero_dim( c ) ) - { - return; - } - - // If alpha is zero, or if A or B has a zero dimension, scale C by beta - // and return early. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) || - bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - bli_scalm( beta, c ); - return; - } - // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); From a1a5a9b4cbef9208da494c45a2f933a8e82559ac Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 21 Sep 2022 18:31:01 -0500 Subject: [PATCH 086/230] Implemented support for fat multithreading. (#665) Details: - Allow the user to configure BLIS in such a way that multiple threading implementations get compiled into the library, with one of those implementations chosen at runtime. For now, there are only three implementations available: OpenMP, pthreads, and single. 
(Here, 'single' merely refers to single-threaded mode.) The configure script now allows the user to give the -t option with a comma-separated list of values, such as '-t openmp,pthreads'. The first value in the list will always be the default at library initialization time, and 'single' is always silently appended to the end of the list. The user can specify which implementation should execute in one of three ways: by setting the BLIS_THREAD_IMPL environment variable prior to launch; by calling the bli_thread_set_thread_impl() global runtime API; or by encoding their choice into a rntm_t that is passed into one of the expert interfaces. Any of these three choices overrides the initialization-time default (i.e., the first value listed to the -t configure option). Requesting an implementation that was not compiled into the library will result in an error message followed by bli_abort(). - Relocated the 'auto' logic for the -t option from the top-level Makefile to the configure script. (Currently, this logic is pretty dumb, choosing 'openmp' for gcc and icc, and 'pthreads' for clang.) - Defined a new 'timpl_t' enum in bli_type_defs.h, with three valid values: BLIS_SINGLE, BLIS_OPENMP, BLIS_POSIX. - Reorganized the thrcomm_t struct into a single definition with two preprocessor blocks, one each for additional fields needed by OpenMP and pthreads. - Added timpl_t argument to bli_thrcomm_bcast(), bli_thrcomm_barrier(), bli_thrcomm_init(), and bli_thrcomm_cleanup(), which these functions need since they are now wrappers that choose the implementation- specific function corresponding to the currently enabled threading implementation. - Added rntm_t* to bli_thread_broadcast(), bli_thread_barrier() so that those functions can pass the timpl_t value into bli_thrcomm_bcast() and bli_thrcomm_barrier(), respectively. - Defined bli_env_get_str() in bli_env.c to allow the querying of BLIS_THREAD_IMPL (which, unlike BLIS_NUM_THREADS and friends, is expected to be a string). 
- Defined bli_thread_get_thread_impl(), bli_thread_set_thread_impl() to get and set the current threading implementation at runtime. - Defined bli_rntm_thread_impl() and bli_rntm_set_thread_impl() to query and set the threading implementation within a rntm_t. Also choose BLIS_SINGLE as the default value when initializing rntm_t structs. - Added bli_info_get_*() functions to query whether OpenMP or pthreads would be chosen as the default at init-time. Note that this only tests whether OpenMP or pthreads is the first implementation in the list passed to the threading configure option (-t) and is *not* the same as querying which implementation is currently selected, since that can be influenced by BLIS_THREAD_IMPL and/or bli_thread_set_thread_impl(). - Changed l3int_t to l3int_ft. - Updated docs/Multithreading.md to document the new behavior. - Updated sandbox/gemmlike and addon/gemmd to work with the new fat threading feature. This included a few bugfixes to bring the codes up to date, as necessary. - Comment, whitespace updates. 
--- Makefile | 34 +-- addon/gemmd/attic/bli_gemm_ex.c | 17 +- addon/gemmd/bao_gemmd.c | 16 +- addon/gemmd/bao_gemmd_bp_var1.c | 2 +- addon/gemmd/bao_l3_packm_a.c | 8 +- addon/gemmd/bao_l3_packm_b.c | 8 +- addon/gemmd/thread/bao_l3_decor.c | 150 ++++++++++++ addon/gemmd/thread/bao_l3_decor.h | 58 +++-- addon/gemmd/thread/bao_l3_decor_openmp.c | 15 +- addon/gemmd/thread/bao_l3_decor_openmp.h | 17 +- addon/gemmd/thread/bao_l3_decor_pthreads.c | 15 +- addon/gemmd/thread/bao_l3_decor_pthreads.h | 17 +- addon/gemmd/thread/bao_l3_decor_single.c | 8 +- addon/gemmd/thread/bao_l3_decor_single.h | 25 +- build/bli_config.h.in | 6 + build/config.mk.in | 2 +- common.mk | 44 ++-- configure | 221 +++++++++++++++--- docs/Multithreading.md | 81 ++++++- frame/1m/packm/bli_packm_alloc.c | 4 +- frame/1m/packm/bli_packm_int.c | 4 +- frame/1m/unpackm/bli_unpackm_int.c | 3 +- frame/1m/unpackm/bli_unpackm_int.h | 1 + frame/3/bli_l3_int.c | 4 +- frame/3/bli_l3_sup_packm.c | 8 +- frame/3/bli_l3_sup_packm_var.c | 8 +- frame/3/bli_l3_sup_var1n2m.c | 14 +- frame/3/gemm/bli_gemm_blk_var3.c | 2 +- frame/3/trsm/bli_trsm_blk_var1.c | 2 +- frame/3/trsm/bli_trsm_blk_var3.c | 2 +- frame/base/bli_env.c | 8 + frame/base/bli_env.h | 1 + frame/base/bli_info.c | 16 ++ frame/base/bli_info.h | 2 + frame/base/bli_rntm.c | 22 +- frame/base/bli_rntm.h | 29 ++- frame/include/bli_config_macro_defs.h | 11 +- frame/include/bli_type_defs.h | 16 ++ frame/thread/bli_l3_decor.c | 176 ++++++++++++++ frame/thread/bli_l3_decor.h | 44 ++-- frame/thread/bli_l3_decor_openmp.c | 39 ++-- frame/thread/bli_l3_decor_openmp.h | 14 ++ frame/thread/bli_l3_decor_pthreads.c | 44 ++-- frame/thread/bli_l3_decor_pthreads.h | 14 ++ frame/thread/bli_l3_decor_single.c | 45 ++-- frame/thread/bli_l3_decor_single.h | 17 +- frame/thread/bli_l3_sup_decor.c | 137 +++++++++++ frame/thread/bli_l3_sup_decor.h | 39 +++- frame/thread/bli_l3_sup_decor_openmp.c | 25 +- frame/thread/bli_l3_sup_decor_openmp.h | 13 ++ 
frame/thread/bli_l3_sup_decor_pthreads.c | 53 +++-- frame/thread/bli_l3_sup_decor_pthreads.h | 13 ++ frame/thread/bli_l3_sup_decor_single.c | 24 +- frame/thread/bli_l3_sup_decor_single.h | 16 +- frame/thread/bli_thrcomm.c | 179 +++++++++++++- frame/thread/bli_thrcomm.h | 90 ++++++- frame/thread/bli_thrcomm_openmp.c | 76 +++--- frame/thread/bli_thrcomm_openmp.h | 49 +--- frame/thread/bli_thrcomm_pthreads.c | 43 +--- frame/thread/bli_thrcomm_pthreads.h | 33 +-- frame/thread/bli_thrcomm_single.c | 37 +-- frame/thread/bli_thrcomm_single.h | 44 +--- frame/thread/bli_thread.c | 113 +++++++-- frame/thread/bli_thread.h | 24 +- frame/thread/bli_thrinfo.c | 12 +- frame/thread/bli_thrinfo.h | 16 +- frame/thread/bli_thrinfo_sup.c | 6 +- sandbox/gemmlike/bls_gemm.c | 24 +- sandbox/gemmlike/bls_gemm.h | 24 +- sandbox/gemmlike/bls_gemm_bp_var1.c | 2 +- sandbox/gemmlike/bls_gemm_check.c | 12 +- sandbox/gemmlike/bls_gemm_check.h | 12 +- sandbox/gemmlike/bls_l3_packm_a.c | 8 +- sandbox/gemmlike/bls_l3_packm_b.c | 8 +- sandbox/gemmlike/thread/bls_l3_decor.c | 148 ++++++++++++ sandbox/gemmlike/thread/bls_l3_decor.h | 43 ++-- sandbox/gemmlike/thread/bls_l3_decor_openmp.c | 15 +- sandbox/gemmlike/thread/bls_l3_decor_openmp.h | 13 ++ .../gemmlike/thread/bls_l3_decor_pthreads.c | 55 +++-- .../gemmlike/thread/bls_l3_decor_pthreads.h | 13 ++ sandbox/gemmlike/thread/bls_l3_decor_single.c | 34 ++- sandbox/gemmlike/thread/bls_l3_decor_single.h | 18 +- testsuite/src/test_libblis.c | 35 ++- 83 files changed, 2083 insertions(+), 717 deletions(-) create mode 100644 addon/gemmd/thread/bao_l3_decor.c create mode 100644 frame/thread/bli_l3_decor.c create mode 100644 frame/thread/bli_l3_sup_decor.c create mode 100644 sandbox/gemmlike/thread/bls_l3_decor.c diff --git a/Makefile b/Makefile index f5396f79b..5c4a32b59 100644 --- a/Makefile +++ b/Makefile @@ -1149,24 +1149,24 @@ endif # ifeq ($(IS_WIN),no) # --- Query current configuration --- showconfig: check-env - @echo "configuration family: 
$(CONFIG_NAME)" - @echo "sub-configurations: $(CONFIG_LIST)" - @echo "requisite kernels sets: $(KERNEL_LIST)" - @echo "kernel-to-config map: $(KCONFIG_MAP)" + @echo "configuration family: $(CONFIG_NAME)" + @echo "sub-configurations: $(CONFIG_LIST)" + @echo "requisite kernels sets: $(KERNEL_LIST)" + @echo "kernel-to-config map: $(KCONFIG_MAP)" @echo "-------------------------" - @echo "BLIS version string: $(VERSION)" - @echo ".so major version: $(SO_MAJOR)" - @echo ".so minor.build vers: $(SO_MINORB)" - @echo "install libdir: $(INSTALL_LIBDIR)" - @echo "install includedir: $(INSTALL_INCDIR)" - @echo "install sharedir: $(INSTALL_SHAREDIR)" - @echo "debugging status: $(DEBUG_TYPE)" - @echo "multithreading status: $(THREADING_MODEL)" - @echo "enable BLAS API? $(MK_ENABLE_BLAS)" - @echo "enable CBLAS API? $(MK_ENABLE_CBLAS)" - @echo "build static library? $(MK_ENABLE_STATIC)" - @echo "build shared library? $(MK_ENABLE_SHARED)" - @echo "ARG_MAX hack enabled? $(ARG_MAX_HACK)" + @echo "BLIS version string: $(VERSION)" + @echo ".so major version: $(SO_MAJOR)" + @echo ".so minor.build vers: $(SO_MINORB)" + @echo "install libdir: $(INSTALL_LIBDIR)" + @echo "install includedir: $(INSTALL_INCDIR)" + @echo "install sharedir: $(INSTALL_SHAREDIR)" + @echo "debugging status: $(DEBUG_TYPE)" + @echo "enabled threading model(s): $(THREADING_MODEL)" + @echo "enable BLAS API? $(MK_ENABLE_BLAS)" + @echo "enable CBLAS API? $(MK_ENABLE_CBLAS)" + @echo "build static library? $(MK_ENABLE_STATIC)" + @echo "build shared library? $(MK_ENABLE_SHARED)" + @echo "ARG_MAX hack enabled? 
$(ARG_MAX_HACK)" # --- Clean rules --- diff --git a/addon/gemmd/attic/bli_gemm_ex.c b/addon/gemmd/attic/bli_gemm_ex.c index 0f40d1cb3..8b7d11d81 100644 --- a/addon/gemmd/attic/bli_gemm_ex.c +++ b/addon/gemmd/attic/bli_gemm_ex.c @@ -36,13 +36,13 @@ void bli_gemm_ex ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -82,7 +82,8 @@ void bli_gemm_ex // Invoke the operation's front end. bli_gemm_front ( - alpha, a, b, beta, c, cntx, rntm, NULL + ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c, + ( cntx_t* )cntx, ( rntm_t* )rntm, NULL ); } diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c index 01185a9d7..8379ff6d4 100644 --- a/addon/gemmd/bao_gemmd.c +++ b/addon/gemmd/bao_gemmd.c @@ -81,16 +81,28 @@ void bao_gemmd_ex if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } + // Set the .pack_a and .pack_b fields to TRUE. This is only needed because + // this addon uses bli_thrinfo_sup_grow(), which calls + // bli_thrinfo_sup_create_for_cntl(), which employs an optimization if + // both fields are FALSE (as is often the case with sup). However, this + // addon implements the "large" code path, and so both A and B must + // always be packed. Setting the fields to TRUE will avoid the optimization + // while this addon implementation executes (and it also reinforces the + // fact that we *are* indeed packing A and B, albeit not in the sup context + // originally envisioned for the .pack_a and .pack_b fields). + bli_rntm_set_pack_a( TRUE, rntm ); + bli_rntm_set_pack_b( TRUE, rntm ); + // Obtain a valid (native) context from the gks if necessary. // NOTE: This must be done before calling the _check() function, since // that function assumes the context pointer is valid. 
- if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + if ( cntx == NULL ) cntx = ( cntx_t* )bli_gks_query_cntx(); // Check parameters. if ( bli_error_checking_is_enabled() ) bao_gemmd_check( alpha, a, d, b, beta, c, cntx ); - // -- bli_gemmd_front() ---------------------------------------------------- + // -- bao_gemmd_front() ---------------------------------------------------- obj_t a_local; obj_t b_local; diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index 689471367..e3f47982c 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -458,7 +458,7 @@ void PASTECH2(bao_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. */ \ - bli_thread_barrier( thread_pb ); \ + bli_thread_barrier( rntm, thread_pb ); \ } \ } \ \ diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c index 49bb34664..1d6502884 100644 --- a/addon/gemmd/bao_l3_packm_a.c +++ b/addon/gemmd/bao_l3_packm_a.c @@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. 
*/ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_a ) diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c index c41b062b6..8d020007c 100644 --- a/addon/gemmd/bao_l3_packm_b.c +++ b/addon/gemmd/bao_l3_packm_b.c @@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -319,7 +319,7 @@ void PASTECH2(bao_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_b ) diff --git a/addon/gemmd/thread/bao_l3_decor.c b/addon/gemmd/thread/bao_l3_decor.c new file mode 100644 index 000000000..ff510b6f3 --- /dev/null +++ b/addon/gemmd/thread/bao_l3_decor.c @@ -0,0 +1,150 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// Initialize a function pointer array containing function addresses for +// each of the threading-specific level-3 thread decorators. + +static l3ao_decor_ft l3ao_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bao_l3_thread_decorator_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bao_l3_thread_decorator_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bao_l3_thread_decorator_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; + +// Define a dispatcher that chooses a threading-specific function from the +// above function pointer array. + +void bao_l3_thread_decorator + ( + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + rntm_t rntm_l; + + // Query the threading implementation and the number of threads requested. + timpl_t ti = bli_rntm_thread_impl( rntm ); + dim_t nt = bli_rntm_num_threads( rntm ); + + if ( bli_error_checking_is_enabled() ) + bao_l3_thread_decorator_check( rntm ); + + if ( 1 < nt && ti == BLIS_SINGLE ) + { + // Here, we resolve conflicting information. The caller requested + // a sequential threading implementation, but also requested more + // than one thread. Here, we choose to favor the requested threading + // implementation over the number of threads, and so reset all + // parallelism parameters to 1. + rntm_l = *rntm; + nt = 1; + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); + bli_rntm_set_num_threads_only( 1, &rntm_l ); + rntm = &rntm_l; + } + + // Use the timpl_t value to index into the corresponding function address + // from the function pointer array. + const l3ao_decor_ft fp = l3ao_decor_fpa[ ti ]; + + // Call the threading-specific decorator function. 
+ fp + ( + func, + family, + alpha, + a, + d, + b, + beta, + c, + cntx, + rntm + ); +} + +void bao_l3_thread_decorator_check + ( + rntm_t* rntm + ) +{ + //err_t e_val; + + //e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) ); + //bli_check_error_code( e_val ); + + const timpl_t ti = bli_rntm_thread_impl( rntm ); + + if ( +#ifndef BLIS_ENABLE_OPENMP + ti == BLIS_OPENMP || +#endif +#ifndef BLIS_ENABLE_PTHREADS + ti == BLIS_POSIX || +#endif + FALSE + ) + { + fprintf( stderr, "\n" ); + fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); + bli_abort(); + } +} + diff --git a/addon/gemmd/thread/bao_l3_decor.h b/addon/gemmd/thread/bao_l3_decor.h index b4fd2b9b7..4c087bdb6 100644 --- a/addon/gemmd/thread/bao_l3_decor.h +++ b/addon/gemmd/thread/bao_l3_decor.h @@ -4,7 +4,8 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021, The University of Texas at Austin + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,18 +33,13 @@ */ -#ifndef BLIS_SBX_L3_DECOR_H -#define BLIS_SBX_L3_DECOR_H - -// -- sup definitions ---------------------------------------------------------- - -// Level-3 sup internal function type. -typedef void (*l3sbxint_t) +// Level-3 internal function type. +typedef void (*l3aoint_ft) ( obj_t* alpha, obj_t* a, - obj_t* d, obj_t* b, + obj_t* d, obj_t* beta, obj_t* c, cntx_t* cntx, @@ -51,19 +47,39 @@ typedef void (*l3sbxint_t) thrinfo_t* thread ); -// Level-3 sup thread decorator prototype. 
+// Level-3 thread decorator function type. +typedef void (*l3ao_decor_ft) + ( + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* d, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +// Level-3 thread decorator prototype. void bao_l3_thread_decorator ( - l3sbxint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* d, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +void bao_l3_thread_decorator_check + ( + rntm_t* rntm ); // Include definitions specific to the method of multithreading. @@ -71,5 +87,3 @@ void bao_l3_thread_decorator #include "bao_l3_decor_openmp.h" #include "bao_l3_decor_pthreads.h" -#endif - diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/gemmd/thread/bao_l3_decor_openmp.c index 1aca8de27..7deee95ed 100644 --- a/addon/gemmd/thread/bao_l3_decor_openmp.c +++ b/addon/gemmd/thread/bao_l3_decor_openmp.c @@ -36,16 +36,11 @@ #ifdef BLIS_ENABLE_OPENMP -// Define a dummy thread entry function, which is needed in the pthreads -// version, so that when building Windows DLLs (with OpenMP enabled or with -// no multithreading) we don't risk having an unresolved symbol. -void* bao_l3_thread_entry( void* data_void ) { return NULL; } - //#define PRINT_THRINFO -void bao_l3_thread_decorator +void bao_l3_thread_decorator_openmp ( - l3sbxint_t func, + l3aoint_ft func, opid_t family, obj_t* alpha, obj_t* a, @@ -66,7 +61,7 @@ void bao_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. 
We do // this up-front only so that we have the rntm_t.sba_pool field @@ -79,7 +74,7 @@ void bao_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) @@ -94,8 +89,6 @@ void bao_l3_thread_decorator const dim_t tid = omp_get_thread_num(); // Check for a somewhat obscure OpenMP thread-mistmatch issue. - // NOTE: This calls the same function used for the conventional/large - // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); // Use the thread id to access the appropriate pool_t* within the diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.h b/addon/gemmd/thread/bao_l3_decor_openmp.h index 9c956d7c3..4ed3e7efc 100644 --- a/addon/gemmd/thread/bao_l3_decor_openmp.h +++ b/addon/gemmd/thread/bao_l3_decor_openmp.h @@ -32,13 +32,22 @@ */ -#ifndef BLIS_SBX_L3_DECOR_OPENMP_H -#define BLIS_SBX_L3_DECOR_OPENMP_H - // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP -#endif +void bao_l3_thread_decorator_openmp + ( + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); #endif diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.c b/addon/gemmd/thread/bao_l3_decor_pthreads.c index 587b8400f..dfbfbaa61 100644 --- a/addon/gemmd/thread/bao_l3_decor_pthreads.c +++ b/addon/gemmd/thread/bao_l3_decor_pthreads.c @@ -39,7 +39,7 @@ // A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { - l3sbxint_t func; + l3aoint_ft func; opid_t family; obj_t* alpha; obj_t* a; @@ -59,7 +59,7 @@ void* bao_l3_thread_entry( void* data_void ) { thread_data_t* data = data_void; - l3sbxint_t func = data->func; + l3aoint_ft func = data->func; opid_t family = data->family; obj_t* alpha = data->alpha; obj_t* a = data->a; @@ -111,9 +111,9 @@ void* bao_l3_thread_entry( void* data_void ) return NULL; } -void bao_l3_thread_decorator +void bao_l3_thread_decorator_pthreads ( - l3sbxint_t func, + l3aoint_ft func, opid_t family, obj_t* alpha, obj_t* a, @@ -216,5 +216,12 @@ void bao_l3_thread_decorator bli_free_intl( datas ); } +#else + +// Define a dummy function bli_l3_thread_entry(), which is needed for +// consistent dynamic linking behavior when building shared objects in Linux +// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. +void* bao_l3_thread_entry( void* data_void ) { return NULL; } + #endif diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.h b/addon/gemmd/thread/bao_l3_decor_pthreads.h index 69adec45e..1c0b58900 100644 --- a/addon/gemmd/thread/bao_l3_decor_pthreads.h +++ b/addon/gemmd/thread/bao_l3_decor_pthreads.h @@ -32,16 +32,25 @@ */ -#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H -#define BLIS_SBX_L3_DECOR_PTHREADS_H - // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. 
void* bao_l3_thread_entry( void* data_void ); -#endif +void bao_l3_thread_decorator_pthreads + ( + l3aoint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); #endif diff --git a/addon/gemmd/thread/bao_l3_decor_single.c b/addon/gemmd/thread/bao_l3_decor_single.c index d60891d65..362c1e68c 100644 --- a/addon/gemmd/thread/bao_l3_decor_single.c +++ b/addon/gemmd/thread/bao_l3_decor_single.c @@ -34,13 +34,11 @@ #include "blis.h" -#ifndef BLIS_ENABLE_MULTITHREADING - #define SKIP_THRINFO_TREE -void bao_l3_thread_decorator +void bao_l3_thread_decorator_single ( - l3sbxint_t func, + l3aoint_ft func, opid_t family, //pack_t schema_a, //pack_t schema_b, @@ -139,5 +137,3 @@ void bao_l3_thread_decorator bli_sba_checkin_array( array ); } -#endif - diff --git a/addon/gemmd/thread/bao_l3_decor_single.h b/addon/gemmd/thread/bao_l3_decor_single.h index 211a43a89..813bb6d75 100644 --- a/addon/gemmd/thread/bao_l3_decor_single.h +++ b/addon/gemmd/thread/bao_l3_decor_single.h @@ -32,13 +32,18 @@ */ -#ifndef BLIS_SBX_L3_DECOR_SINGLE_H -#define BLIS_SBX_L3_DECOR_SINGLE_H - -// Definitions specific to situations when multithreading is disabled. 
-#ifndef BLIS_ENABLE_MULTITHREADING - -#endif - -#endif - +void bao_l3_thread_decorator_single + ( + l3aoint_ft func, + opid_t family, + //pack_t schema_a, + //pack_t schema_b, + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); diff --git a/build/bli_config.h.in b/build/bli_config.h.in index fa6bbbe12..5208a90f8 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -53,10 +53,16 @@ #if @enable_openmp@ #define BLIS_ENABLE_OPENMP +#if @enable_openmp_as_def@ +#define BLIS_ENABLE_OPENMP_AS_DEFAULT +#endif #endif #if @enable_pthreads@ #define BLIS_ENABLE_PTHREADS +#if @enable_pthreads_as_def@ +#define BLIS_ENABLE_PTHREADS_AS_DEFAULT +#endif #endif #if @enable_jrir_slab@ diff --git a/build/config.mk.in b/build/config.mk.in index 7ef8c6bd0..849a7ccfa 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -127,7 +127,7 @@ DEBUG_TYPE := @debug_type@ # Whether operating system support was requested via --enable-system. ENABLE_SYSTEM := @enable_system@ -# The requested threading model. +# The requested threading model(s). THREADING_MODEL := @threading_model@ # Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option. diff --git a/common.mk b/common.mk index b49089419..00b9f8ad3 100644 --- a/common.mk +++ b/common.mk @@ -802,44 +802,46 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c))) # since that option forces --enable-threading=none, and thus -pthread never gets # added to begin with. 
+CTHREADFLAGS := + ifeq ($(CC_VENDOR),gcc) -ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := openmp -endif -ifeq ($(THREADING_MODEL),openmp) -CTHREADFLAGS := -fopenmp +#ifneq ($(findstring auto,$(THREADING_MODEL)),) +#THREADING_MODEL := openmp +#endif +ifneq ($(findstring openmp,$(THREADING_MODEL)),) +CTHREADFLAGS += -fopenmp LDFLAGS += -fopenmp endif -ifeq ($(THREADING_MODEL),pthreads) -CTHREADFLAGS := -pthread +ifneq ($(findstring pthreads,$(THREADING_MODEL)),) +CTHREADFLAGS += -pthread LDFLAGS += $(LIBPTHREAD) endif endif ifeq ($(CC_VENDOR),icc) -ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := openmp -endif -ifeq ($(THREADING_MODEL),openmp) -CTHREADFLAGS := -fopenmp +#ifneq ($(findstring auto,$(THREADING_MODEL)),) +#THREADING_MODEL := openmp +#endif +ifneq ($(findstring openmp,$(THREADING_MODEL)),) +CTHREADFLAGS += -fopenmp LDFLAGS += -fopenmp endif -ifeq ($(THREADING_MODEL),pthreads) -CTHREADFLAGS := -pthread +ifneq ($(findstring pthreads,$(THREADING_MODEL)),) +CTHREADFLAGS += -pthread LDFLAGS += $(LIBPTHREAD) endif endif ifeq ($(CC_VENDOR),clang) -ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := pthreads -endif -ifeq ($(THREADING_MODEL),openmp) -CTHREADFLAGS := -fopenmp +#ifneq ($(findstring auto,$(THREADING_MODEL)),) +#THREADING_MODEL := pthreads +#endif +ifneq ($(findstring openmp,$(THREADING_MODEL)),) +CTHREADFLAGS += -fopenmp LDFLAGS += -fopenmp endif -ifeq ($(THREADING_MODEL),pthreads) -CTHREADFLAGS := -pthread +ifneq ($(findstring pthreads,$(THREADING_MODEL)),) +CTHREADFLAGS += -pthread LDFLAGS += $(LIBPTHREAD) endif endif diff --git a/configure b/configure index a6018edab..858ce55de 100755 --- a/configure +++ b/configure @@ -169,10 +169,23 @@ print_usage() echo " " echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" echo " " - echo " Enable threading in the library, using threading model" - echo " MODEL={openmp,pthreads,no}. If MODEL=no or " - echo " --disable-threading is specified, threading will be" - echo " disabled. 
The default is 'no'." + echo " Enable threading in the library, using threading model(s)" + echo " MODEL={single,openmp,pthreads,auto}. If multiple values" + echo " are specified within MODEL, they will all be compiled into" + echo " BLIS, and the choice of which to use will be determined at" + echo " runtime. If the user does not express a preference (by" + echo " setting the BLIS_THREAD_IMPL environment variable to" + echo " 'single', 'openmp', or 'pthreads'; by calling the global" + echo " runtime API bli_thread_set_thread_impl(); or by encoding a" + echo " choice on a per-call basis within a rntm_t passed into the" + echo " expert API), then the first model listed in MODEL will be" + echo " used by default. Note that 'single' is silently appended" + echo " to whatever the user specifies in MODEL, meaning that" + echo " single-threaded functionality will always be available," + echo " even if it is not requested and even if it is not enabled" + echo " by default. Even --disable-threading is actually shorthand" + echo " for --enable-threading=single (which is the default when" + echo " the option is not specified)." echo " " echo " --enable-system, --disable-system" echo " " @@ -2606,7 +2619,7 @@ main() threading_model=${OPTARG#*=} ;; disable-threading) - threading_model='off' + threading_model='single' ;; thread-part-jrir=*) thread_part_jrir=${OPTARG#*=} @@ -3420,36 +3433,182 @@ main() fi # Check the threading model flag and standardize its value, if needed. - # NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred. + # Note that single-threaded mode will always be enabled, but not necessarily + # by default. 
+ enable_single='yes' enable_openmp='no' - enable_openmp_01=0 enable_pthreads='no' + enable_single_01=1 + enable_openmp_01=0 enable_pthreads_01=0 - if [ "x${threading_model}" = "xauto" ]; then + parsed_tm='' + first_tm='' + enable_single_as_def_01=0 + enable_openmp_as_def_01=0 + enable_pthreads_as_def_01=0 + + # Convert whatever reasonable separator the user may have used into a space. + threading_model_list=$(echo "${threading_model}" | sed -e "s/[,+]/ /g") + + # Search for all recognized values and standardize them to one of four + # strings: 'single', 'openmp', 'pthreads', 'auto'. Notice that we keep + # the strings in the same order as they originally appeared. + for word in ${threading_model_list}; do + + if [ "x${word}" = "xsingle" ] || + [ "x${word}" = "xnone" ] || + [ "x${word}" = "xoff" ] || + [ "x${word}" = "xno" ]; then + + parsed_tm="${parsed_tm} single" + + elif [ "x${word}" = "xopenmp" ] || + [ "x${word}" = "xomp" ]; then + + parsed_tm="${parsed_tm} openmp" + + elif [ "x${word}" = "xpthreads" ] || + [ "x${word}" = "xpthread" ] || + [ "x${word}" = "xposix" ]; then + + parsed_tm="${parsed_tm} pthreads" + + elif [ "x${word}" = "xauto" ]; then + + parsed_tm="${parsed_tm} auto" + + else + + echo "${script_name}: *** Unsupported threading model: ${word}." + exit 1 + fi + done + + # Always enable single-threaded behavior. If the user explicitly + # requested 'single' as well as other modes, the first occurrence will + # be kept when duplicates are removed, which will preserve the order + # for purposes of determining which mode will be the default (absent + # any explicit choice at runtime). + parsed_tm="${parsed_tm} single" + + # Remove duplicates, if they exist. + parsed_tm=$(rm_duplicate_words_simple "${parsed_tm}") + + #echo "parsed_tm0: _${parsed_tm}_" + + # If parsed_tm contains 'auto', substitute in the automatic choice + # based on which compiler family is being used. 
+ if [ "$(is_in_list "auto" "${parsed_tm}")" = "true" ]; then + + # If 'auto' was found in the threading model string, select a + # concrete threading model based on the compiler vendor and + # substitute it for 'auto' in parsed_tm. (If the vendor is not + # recognized, no model is selected here.) echo "${script_name}: determining the threading model automatically." - elif [ "x${threading_model}" = "xopenmp" ] || - [ "x${threading_model}" = "xomp" ]; then - echo "${script_name}: using OpenMP for threading." - enable_openmp='yes' - enable_openmp_01=1 - threading_model="openmp" # Standardize the value. - elif [ "x${threading_model}" = "xpthreads" ] || - [ "x${threading_model}" = "xpthread" ] || - [ "x${threading_model}" = "xposix" ]; then - echo "${script_name}: using POSIX threads for threading." - enable_pthreads='yes' - enable_pthreads_01=1 - threading_model="pthreads" # Standardize the value. - elif [ "x${threading_model}" = "xoff" ] || - [ "x${threading_model}" = "xno" ] || - [ "x${threading_model}" = "xnone" ]; then - echo "${script_name}: threading is disabled." - threading_model="off" - else - echo "${script_name}: *** Unsupported threading model: ${threading_model}." - exit 1 + + # Use OpenMP for gcc and icc, but pthreads for clang. + if [ "${cc_vendor}" = "gcc" ]; then + + selected_tm="openmp" + echo "${script_name}: automatically selected OpenMP." + + elif [ "${cc_vendor}" = "icc" ]; then + + selected_tm="openmp" + echo "${script_name}: automatically selected OpenMP." + + elif [ "${cc_vendor}" = "clang" ]; then + + selected_tm="pthreads" + echo "${script_name}: automatically selected pthreads." + fi + + # Substitute the selected threading model for 'auto' in parsed_tm. + parsed_tm=$(substitute_words "auto" "${selected_tm}" "${parsed_tm}") fi + #echo "parsed_tm1: _${parsed_tm}_" + + # Remove any extra whitespace. + parsed_tm=$(canonicalize_ws "${parsed_tm}") + + #echo "parsed_tm2: _${parsed_tm}_" + + # Find the first word.
This will be the default threading model. + first_tm=${parsed_tm%% *} + + #echo "first_tm0: _${first_tm}_" + + # Now that we've standardized the list, removed duplicates, and handled + # the possibility of 'auto' being among the listed threading models, we can + # proceed to formally processing each threading model to enable. Since + # 'auto' has been converted to 'openmp' or 'pthreads', we only need to + # handle the remaining three options (openmp, pthreads, and single) going + # forward. + for word in ${parsed_tm}; do + + if [ "x${word}" = "xsingle" ]; then + + echo "${script_name}: enabling support for single-threading." + enable_single='yes' + enable_single_01=1 + + elif [ "x${word}" = "xopenmp" ]; then + + echo "${script_name}: enabling support for threading via OpenMP." + enable_openmp='yes' + enable_openmp_01=1 + + elif [ "x${word}" = "xpthreads" ]; then + + echo "${script_name}: enabling support for threading via pthreads." + enable_pthreads='yes' + enable_pthreads_01=1 + fi + done + + # Define boolean variables that can easily be interpreted with #ifdef + # directives. + if [ "x${first_tm}" = "xsingle" ]; then + + enable_single_as_def_01=1 + enable_openmp_as_def_01=0 + enable_pthreads_as_def_01=0 + + elif [ "x${first_tm}" = "xopenmp" ]; then + + enable_single_as_def_01=0 + enable_openmp_as_def_01=1 + enable_pthreads_as_def_01=0 + + elif [ "x${first_tm}" = "xpthreads" ]; then + + enable_single_as_def_01=0 + enable_openmp_as_def_01=0 + enable_pthreads_as_def_01=1 + fi + + # If either OpenMP or pthreads was enabled, given that single-threaded mode is + # also always enabled, remind the user which one will serve as the default + # (that is, absent any explicit choice at runtime). + if [ "x${enable_openmp}" = "xyes" ] || + [ "x${enable_pthreads}" = "xyes" ]; then + + if [ "x${first_tm}" = "xsingle" ]; then + echo "${script_name}: threading will default to single-threaded." 
+ elif [ "x${first_tm}" = "xopenmp" ]; then + echo "${script_name}: threading will default to OpenMP." + elif [ "x${first_tm}" = "xpthreads" ]; then + echo "${script_name}: threading will default to pthreads." + fi + fi + + # Copy the final parsed threading model list back to the original variable. + threading_model="${parsed_tm}" + + #echo "parsed_tm: _${parsed_tm}_" + #echo "first_tm: _${first_tm}_" + # Check the method of assigning micropanels to threads in the JR and IR # loops. enable_jrir_slab_01=0 @@ -3461,7 +3620,7 @@ main() echo "${script_name}: requesting round-robin threading in jr and ir loops." enable_jrir_rr_01=1 else - echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${threading_model}." + echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}." exit 1 fi @@ -3891,7 +4050,9 @@ main() | perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \ | sed -e "s/@enable_system@/${enable_system_01}/g" \ | sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \ + | sed -e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g" \ | sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \ + | sed -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \ | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ | sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \ | sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \ diff --git a/docs/Multithreading.md b/docs/Multithreading.md index 8e636f06a..933296f79 100644 --- a/docs/Multithreading.md +++ b/docs/Multithreading.md @@ -1,5 +1,5 @@ # Contents - +/ * **[Contents](Multithreading.md#contents)** * **[Introduction](Multithreading.md#introduction)** * **[Enabling multithreading](Multithreading.md#enabling-multithreading)** @@ -9,13 +9,16 @@ * [Globally via environment variables](Multithreading.md#globally-via-environment-variables) * [The automatic way](Multithreading.md#environment-variables-the-automatic-way) * 
[The manual way](Multithreading.md#environment-variables-the-manual-way) + * [Overriding the default threading implementation](Multithreading.md#environment-variables-overriding-the-default-threading-implementation) * [Globally at runtime](Multithreading.md#globally-at-runtime) * [The automatic way](Multithreading.md#globally-at-runtime-the-automatic-way) * [The manual way](Multithreading.md#globally-at-runtime-the-manual-way) + * [Overriding the default threading implementation](Multithreading.md#globally-at-runtime-overriding-the-default-threading-implementation) * [Locally at runtime](Multithreading.md#locally-at-runtime) * [Initializing a rntm_t](Multithreading.md#initializing-a-rntm-t) * [The automatic way](Multithreading.md#locally-at-runtime-the-automatic-way) * [The manual way](Multithreading.md#locally-at-runtime-the-manual-way) + * [Overriding the default threading implementation](Multithreading.md#locally-at-runtime-overriding-the-default-threading-implementation) * [Using the expert interface](Multithreading.md#locally-at-runtime-using-the-expert-interface) * **[Known issues](Multithreading.md#known-issues)** * **[Conclusion](Multithreading.md#conclusion)** @@ -35,13 +38,13 @@ To summarize: In order to observe multithreaded parallelism within a BLIS operat BLIS disables multithreading by default. In order to allow multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time. -As of this writing, BLIS optionally supports multithreading via either OpenMP or POSIX threads. +As of this writing, BLIS optionally supports multithreading via OpenMP or POSIX threads (or both). To enable multithreading via OpenMP, you must provide the `--enable-threading` option to the `configure` script: ``` $ ./configure --enable-threading=openmp auto ``` -In this example, we target the `auto` configuration, which is like asking `configure` to choose the most appropriate configuration based on some detection heuristic (e.g.
`cpuid` on x86_64). Similarly, to enable multithreading via POSIX threads (pthreads), specify the threading model as `pthreads` instead of `openmp`: +In this example, we target the `auto` configuration, which is like asking `configure` to choose the most appropriate configuration based on some detection heuristic (e.g. `cpuid` on x86_64 hardware). Similarly, to enable multithreading via POSIX threads (pthreads), specify the threading model as `pthreads` instead of `openmp`: ``` $ ./configure --enable-threading=pthreads auto ``` @@ -50,7 +53,12 @@ You can also use the shorthand option for `--enable-threading`, which is `-t`: $ ./configure -t openmp auto $ ./configure -t pthreads auto ``` -For more complete and up-to-date information on the `--enable-threading` option, simply run `configure` with the `--help` (or `-h`) option: +You may even combine multiple threading implementations into the same library build. We call this "fat threading." When more than one option is given, the first option acts as the default. Note that no matter what arguments you specify for the `-t` option, the single-threaded implementation will always be available. +``` +$ ./configure -t openmp,pthreads auto +``` +In the above example, OpenMP will serve as the default threading implementation since it is listed first. This default can be overridden at runtime, though, which is discussed later on. +For more complete and up-to-date information on the `--enable-threading` option, run `configure` with the `--help` (or `-h`) option: ``` $ ./configure --help ``` @@ -129,11 +137,15 @@ Regardless of whether you end up using the automatic or manual way of expressing The automatic way of specifying parallelism entails setting the total number of threads you wish BLIS to employ in its parallelization. This total number of threads is captured by the `BLIS_NUM_THREADS` environment variable. You can set this variable prior to executing your BLIS-linked executable: ``` -$ export GOMP_CPU_AFFINITY="..." 
# optional step when using GNU libgomp. +$ export GOMP_CPU_AFFINITY="0-15" # optional step when using GNU libgomp. $ export BLIS_NUM_THREADS=16 $ ./my_blis_program ``` -This causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `OMP_NUM_THREADS`. If neither variable is set, the default number of threads is 1. +If you don't want or need your environment variable assignments to persist after `my_blis_program` completes, you can instead set the variables only for the duration of the program as follows: +``` +$ GOMP_CPU_AFFINITY="0-15" BLIS_NUM_THREADS=16 ./my_blis_program +``` +Either of these approaches causes BLIS to automatically determine a reasonable threading strategy based on what is known about the operation and problem size. If `BLIS_NUM_THREADS` is not set, BLIS will attempt to query the value of `BLIS_NT` (a shorthand alternative to `BLIS_NUM_THREADS`). If neither variable is defined, then BLIS will attempt to read `OMP_NUM_THREADS`. If none of these variables is set, the default number of threads is 1. **Note**: We *highly* discourage use of the `OMP_NUM_THREADS` environment variable to specify multithreading within BLIS and may remove support for it in the future. If you wish to set parallelism globally via environment variables, please use `BLIS_NUM_THREADS`. @@ -166,6 +178,23 @@ Next, which combinations of loops to parallelize depends on which caches are sha ![The primary algorithm for level-3 operations in BLIS](http://www.cs.utexas.edu/users/field/mm_algorithm_color.png) +### Environment variables: overriding the default threading implementation + +Just as you may specify the number of threads for BLIS to use by setting environment variables prior to running your BLIS-linked application, you may also specify your preferred threading implementation. 
Suppose that you configured BLIS as follows: +``` +$ ./configure -t openmp,pthreads auto +``` +This will result in both OpenMP and pthreads implementations being compiled and included within the BLIS library, with OpenMP serving as the default (since it was listed first to the `-t` option). You can link your program against this BLIS library and force the use of pthreads (instead of OpenMP) via environment variables as follows: +``` +$ BLIS_THREAD_IMPL=pthreads BLIS_NUM_THREADS=8 ./my_blis_program +``` +You can even disable multithreading altogether by forcing the use of the single-threaded code path: +``` +$ BLIS_THREAD_IMPL=single ./my_blis_program +``` +Note that if `BLIS_THREAD_IMPL` is assigned to `single`, any other threading-related variables that may be set, such as `BLIS_NUM_THREADS` or any of the `BLIS_*_NT` variables, are ignored. +If `BLIS_THREAD_IMPL` is not set, BLIS will attempt to query its shorthand alternative, `BLIS_TI`. If neither value is set, the configure-time default (in the example shown above, OpenMP) will prevail. + ## Globally at runtime If you still wish to set the parallelization scheme globally, but you want to do so at runtime, BLIS provides a thread-safe API for specifying multithreading. Think of these functions as a way to modify the same internal data structure into which the environment variables are read. (Recall that the environment variables are only read once, when BLIS is initialized). @@ -207,6 +236,26 @@ bli_thread_set_ways( 2, 1, 4, 1, 1 ); we are requesting 2 ways of parallelism in the `JC` loop and 4 ways of parallelism in the `IC` loop. Unlike environment variables, which only allow the user to set the parallelization strategy prior to running the executable, `bli_thread_set_ways()` may be called any time during the normal course of the BLIS-linked application's execution. 
+### Globally at runtime: overriding the default threading implementation + +Let's assume that you configured BLIS as follows: +``` +$ ./configure -t openmp,pthreads auto +``` +This will result in both OpenMP and pthreads implementations being compiled and included within the BLIS library, with OpenMP serving as the default (since it was listed first to the `-t` option). You can link your program against this BLIS library and force the use of pthreads (instead of OpenMP) globally at runtime via the following API: +```c +void bli_thread_set_thread_impl( timpl_t ti ); +``` +The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling: +```c +bli_thread_set_thread_impl( BLIS_POSIX ); +``` +You can even disable multithreading altogether by forcing the use of the single-threaded code path: +```c +bli_thread_set_thread_impl( BLIS_SINGLE ); +``` +Note that if `BLIS_SINGLE` is specified, any other threading-related parameters previously set, such as via `bli_thread_set_num_threads()` or `bli_thread_set_ways()`, are ignored. + ## Locally at runtime In addition to the global methods based on environment variables and runtime function calls, BLIS also offers a local, *per-call* method of requesting parallelism at runtime. This method has the benefit of being thread-safe and flexible; your application can spawn two threads at the application level, with each thread requesting different degrees of parallelism from their respective calls to level-3 BLIS operations. @@ -262,6 +311,26 @@ bli_rntm_set_ways( 1, 1, 2, 3, 1, &rntm ); ``` we are requesting two ways of parallelism in the `IC` loop and three ways of parallelism in the `JR` loop.
+### Locally at runtime: overriding the default threading implementation + +Let's assume that you configured BLIS as follows: +``` +$ ./configure -t openmp,pthreads auto +``` +This will result in both OpenMP and pthreads implementations being compiled and included within the BLIS library, with OpenMP serving as the default (since it was listed first to the `-t` option). You can link your program against this BLIS library and force the use of pthreads (instead of OpenMP) at runtime, on a per-call basis, by encoding your choice within your `rntm_t`: +```c +void bli_rntm_set_thread_impl( timpl_t ti, rntm_t* rntm ); +``` +The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling: +```c +bli_rntm_set_thread_impl( BLIS_POSIX, &rntm ); +``` +You can even disable multithreading altogether by forcing the use of the single-threaded code path: +```c +bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm ); +``` +Note that if `BLIS_SINGLE` is specified, any other threading-related parameters previously set within the `rntm_t`, such as via `bli_rntm_set_num_threads()` or `bli_rntm_set_ways()`, are ignored. + ### Locally at runtime: using the expert interfaces Regardless of whether you specified parallelism into your `rntm_t` object via the automatic or manual method, eventually you must use the data structure when calling a BLIS operation in order for it to have any effect. diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index 22ed31ecc..07f54de78 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -103,7 +103,7 @@ void* bli_packm_alloc_ex // Broadcast the address of the chief thread's local mem_t entry to // all threads.
- local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); + local_mem_p = bli_thread_broadcast( rntm, thread, &local_mem_s ); // Save the chief thread's local mem_t entry to the mem_t field in // this thread's control tree node. @@ -111,7 +111,7 @@ void* bli_packm_alloc_ex // Barrier so that the master thread doesn't return from the function // before we are done reading. - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); } return bli_mem_buffer( cntl_mem_p ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index f76607508..ae788e671 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -51,7 +51,7 @@ void bli_packm_int // Barrier so that we know threads are done with previous computation // with the same packing buffer before starting to pack. - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); // Invoke the variant with kappa_use. f @@ -65,6 +65,6 @@ void bli_packm_int ); // Barrier so that packing is done before computation. - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); } diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index f6b09d8ae..3b542b061 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -39,6 +39,7 @@ void bli_unpackm_int const obj_t* p, const obj_t* a, const cntx_t* cntx, + const rntm_t* rntm, const cntl_t* cntl, const thrinfo_t* thread ) @@ -73,6 +74,6 @@ void bli_unpackm_int } // Barrier so that unpacking is done before computation. 
- bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); } diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index 8258ea367..fc2c3e66d 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -37,6 +37,7 @@ void bli_unpackm_int const obj_t* p, const obj_t* a, const cntx_t* cntx, + const rntm_t* rntm, const cntl_t* cntl, const thrinfo_t* thread ); diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index b786236ab..b9d389839 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -70,7 +70,7 @@ void bli_l3_int { if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); return; } @@ -84,7 +84,7 @@ void bli_l3_int if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); return; } diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c index b7a7ee02b..5ed7700dc 100644 --- a/frame/3/bli_l3_sup_packm.c +++ b/frame/3/bli_l3_sup_packm.c @@ -64,7 +64,7 @@ void bli_packm_sup_init_mem // Barrier to make sure all threads are caught up and ready to begin // the packm stage. - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); // Compute the size of the memory block eneded. siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack; @@ -94,7 +94,7 @@ void bli_packm_sup_init_mem // Broadcast the address of the chief thread's passed-in mem_t // to all threads. - mem_t* mem_p = bli_thread_broadcast( thread, mem ); + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); // Non-chief threads: Copy the contents of the chief thread's // passed-in mem_t to the passed-in mem_t for this thread. (The @@ -143,7 +143,7 @@ void bli_packm_sup_init_mem // Broadcast the address of the chief thread's passed-in mem_t // to all threads. 
- mem_t* mem_p = bli_thread_broadcast( thread, mem ); + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); // Non-chief threads: Copy the contents of the chief thread's // passed-in mem_t to the passed-in mem_t for this thread. (The @@ -422,7 +422,7 @@ void bli_packm_sup } // Barrier so that packing is done before computation. - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); } } diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 357251002..71357cec4 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -244,7 +244,7 @@ if ( col_stored ) { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( thread ); \ +bli_thread_barrier( rntm, thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -255,7 +255,7 @@ bli_thread_barrier( thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( thread ); \ +bli_thread_barrier( rntm, thread ); \ } \ else { \ if ( bli_thread_work_id( thread ) == 0 ) \ @@ -268,7 +268,7 @@ else { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( thread ); \ +bli_thread_barrier( rntm, thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -279,7 +279,7 @@ bli_thread_barrier( thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( thread ); \ +bli_thread_barrier( rntm, thread ); \ } \ */ /* diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index 76f1a96b7..e4858621a 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -232,7 +232,7 @@ void bli_gemmsup_ref_var1n mem_t mem_a = BLIS_MEM_INITIALIZER; mem_t mem_b = 
BLIS_MEM_INITIALIZER; -\ + // Define an array of bszid_t ids, which will act as our substitute for // the cntl_t tree. // NOTE: These bszid_t values, and their order, match that of the bp @@ -246,22 +246,22 @@ void bli_gemmsup_ref_var1n // packed in the 3rd loop. // 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; -\ + // Determine whether we are using more than one thread. const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); -\ + thrinfo_t* thread_jc = NULL; thrinfo_t* thread_pc = NULL; thrinfo_t* thread_pa = NULL; thrinfo_t* thread_ic = NULL; thrinfo_t* thread_pb = NULL; thrinfo_t* thread_jr = NULL; -\ + // Pre-grow the thrinfo_t tree. bszid_t* bszids_jc = bszids; thread_jc = thread; bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); -\ + bszid_t* bszids_pc = &bszids_jc[1]; thread_pc = bli_thrinfo_sub_node( thread_jc ); bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); @@ -472,7 +472,7 @@ void bli_gemmsup_ref_var1n // NOTE: This barrier is only needed if we are packing A (since // that matrix is packed within the pc loop of this variant). - if ( packa ) bli_thread_barrier( thread_pa ); + if ( packa ) bli_thread_barrier( rntm, thread_pa ); } } @@ -909,7 +909,7 @@ void bli_gemmsup_ref_var2m // NOTE: This barrier is only needed if we are packing B (since // that matrix is packed within the pc loop of this variant). 
- if ( packb ) bli_thread_barrier( thread_pb ); + if ( packb ) bli_thread_barrier( rntm, thread_pb ); } } diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 1bbec1d95..cb20b7f36 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -88,7 +88,7 @@ void bli_gemm_blk_var3 bli_thrinfo_sub_node( thread ) ); - bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); + bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 79ac65c48..413b12818 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -118,7 +118,7 @@ void bli_trsm_blk_var1 // We must execute a barrier here because the upcoming rank-k update // requires the packed matrix B to be fully updated by the trsm // subproblem. - bli_thread_barrier( thread ); + bli_thread_barrier( rntm, thread ); // Isolate the remaining part of the column panel matrix A, which we do by // acquiring the subpartition ahead of A11 (that is, A21 or A01, depending diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 2ff3db6f1..229259a95 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -89,7 +89,7 @@ void bli_trsm_blk_var3 ); //bli_thread_ibarrier( thread ); - bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); + bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. 
Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c index 0972f1771..fab6af89e 100644 --- a/frame/base/bli_env.c +++ b/frame/base/bli_env.c @@ -89,6 +89,14 @@ gint_t bli_env_get_var( const char* env, gint_t fallback ) return r_val; } +char* bli_env_get_str( const char* env ) +{ + // Query the environment variable. + return getenv( env ); +} + + + #if 0 #ifdef _MSC_VER #define strerror_r(errno,buf,len) strerror_s(buf,len,errno) diff --git a/frame/base/bli_env.h b/frame/base/bli_env.h index de86fadff..207fbf9d4 100644 --- a/frame/base/bli_env.h +++ b/frame/base/bli_env.h @@ -38,6 +38,7 @@ #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); +char* bli_env_get_str( const char* env ); //void bli_env_set_var( const char* env, dim_t value ); #endif diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 72b54ca20..9d6e181d3 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -123,6 +123,22 @@ gint_t bli_info_get_enable_pthreads( void ) return 0; #endif } +gint_t bli_info_get_enable_openmp_as_default( void ) +{ +#ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_pthreads_as_default( void ) +{ +#ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT + return 1; +#else + return 0; +#endif +} gint_t bli_info_get_thread_part_jrir_slab( void ) { #ifdef BLIS_ENABLE_JRIR_SLAB diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 250504c23..b3514f434 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -70,6 +70,8 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp_as_default( void ); +BLIS_EXPORT_BLIS gint_t 
bli_info_get_enable_pthreads_as_default( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 1411ffaa3..895976679 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -285,6 +285,11 @@ void bli_rntm_set_ways_from_rntm bli_rntm_set_auto_factor_only( auto_factor, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); + + // NOTE: The caller should have already set the timpl_t field of the rntm_t, + // either in the course of it being initialized via BLIS_RNTM_INITIALIZER + // or bli_rntm_init(), or by the user (subsequently) setting the value + // directly via bli_rntm_set_thread_impl(). } void bli_rntm_set_ways_from_rntm_sup @@ -418,16 +423,19 @@ void bli_rntm_print const rntm_t* rntm ) { - dim_t af = bli_rntm_auto_factor( rntm ); + timpl_t ti = bli_rntm_thread_impl( rntm ); + + dim_t af = bli_rntm_auto_factor( rntm ); - dim_t nt = bli_rntm_num_threads( rntm ); + dim_t nt = bli_rntm_num_threads( rntm ); - dim_t jc = bli_rntm_jc_ways( rntm ); - dim_t pc = bli_rntm_pc_ways( rntm ); - dim_t ic = bli_rntm_ic_ways( rntm ); - dim_t jr = bli_rntm_jr_ways( rntm ); - dim_t ir = bli_rntm_ir_ways( rntm ); + dim_t jc = bli_rntm_jc_ways( rntm ); + dim_t pc = bli_rntm_pc_ways( rntm ); + dim_t ic = bli_rntm_ic_ways( rntm ); + dim_t jr = bli_rntm_jr_ways( rntm ); + dim_t ir = bli_rntm_ir_ways( rntm ); + printf( "thread impl: %d\n", ti ); printf( "rntm contents nt jc pc ic jr ir\n" ); printf( "autofac? 
%1d | %4d%4d%4d%4d%4d%4d\n", (int)af, (int)nt, (int)jc, (int)pc, diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index 8b6538484..426b74d60 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -43,6 +43,8 @@ /* typedef struct rntm_s { + timpl_t thread_impl; + bool auto_factor; dim_t num_threads; @@ -61,6 +63,11 @@ typedef struct rntm_s // -- rntm_t query (public API) ------------------------------------------------ // +BLIS_INLINE timpl_t bli_rntm_thread_impl( const rntm_t* rntm ) +{ + return rntm->thread_impl; +} + BLIS_INLINE bool bli_rntm_auto_factor( const rntm_t* rntm ) { return rntm->auto_factor; @@ -133,6 +140,11 @@ BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm ) // -- rntm_t modification (internal use only) ---------------------------------- // +BLIS_INLINE void bli_rntm_set_thread_impl_only( timpl_t thread_impl, rntm_t* rntm ) +{ + rntm->thread_impl = thread_impl; +} + BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; @@ -215,6 +227,12 @@ BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) // -- rntm_t modification (public API) ----------------------------------------- // +BLIS_INLINE void bli_rntm_set_thread_impl( timpl_t thread_impl, rntm_t* rntm ) +{ + // Set the threading implementation to use. + bli_rntm_set_thread_impl_only( thread_impl, rntm ); +} + BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. 
@@ -292,6 +310,7 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) #define BLIS_RNTM_INITIALIZER \ { \ + .thread_impl = BLIS_SINGLE, \ .auto_factor = TRUE, \ .num_threads = 1, \ .thrloop = { 1, 1, 1, 1, 1, 1 }, \ @@ -304,6 +323,8 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { + bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm ); + bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); @@ -316,7 +337,9 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) bli_rntm_clear_pba( rntm ); } +// // -- rntm_t total thread calculation ------------------------------------------ +// BLIS_INLINE dim_t bli_rntm_calc_num_threads ( @@ -334,9 +357,9 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads return n_threads; } -// ----------------------------------------------------------------------------- - -// Function prototypes +// +// -- Function prototypes ------------------------------------------------------ +// BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 0c75fb639..542973b18 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -83,17 +83,10 @@ // Default behavior is disabled. #endif -// Perform a sanity check to make sure the user doesn't try to enable -// both OpenMP and pthreads. -#if defined ( BLIS_ENABLE_OPENMP ) && \ - defined ( BLIS_ENABLE_PTHREADS ) - #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." -#endif - // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when -// we want to detect use of either OpenMP or pthreads (as opposed -// to neither being used). +// we want to detect use of either OpenMP or pthreads, or both (as +// opposed to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index b5c3ec255..d37e62f8a 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -622,6 +622,20 @@ typedef enum #define bli_nat BLIS_NAT +// -- Threading implementation type -- + +typedef enum +{ + BLIS_SINGLE = 0, + BLIS_OPENMP, + BLIS_POSIX, + + // BLIS_NUM_THREAD_IMPLS must be last! + BLIS_NUM_THREAD_IMPLS + +} timpl_t; + + // -- Kernel ID types -- typedef enum @@ -1434,6 +1448,8 @@ typedef struct cntx_s typedef struct rntm_s { // "External" fields: these may be queried by the end-user. + timpl_t thread_impl; + bool auto_factor; dim_t num_threads; diff --git a/frame/thread/bli_l3_decor.c b/frame/thread/bli_l3_decor.c new file mode 100644 index 000000000..33fb834be --- /dev/null +++ b/frame/thread/bli_l3_decor.c @@ -0,0 +1,176 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// Initialize a function pointer array containing function addresses for +// each of the threading-specific level-3 thread decorators. + +static l3_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bli_l3_thread_decorator_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bli_l3_thread_decorator_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bli_l3_thread_decorator_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; + +// Define a dispatcher that chooses a threading-specific function from the +// above function pointer array. + +void bli_l3_thread_decorator + ( + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ) +{ + rntm_t rntm_l; + + // Query the threading implementation and the number of threads requested. + timpl_t ti = bli_rntm_thread_impl( rntm ); + dim_t nt = bli_rntm_num_threads( rntm ); + +#if 0 + printf( "(pre-opt) application requested rntm.thread_impl = %s\n", + ( ti == BLIS_SINGLE ? 
"single" : + ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); +#endif + + if ( bli_error_checking_is_enabled() ) + bli_l3_thread_decorator_check( rntm ); + +#ifdef BLIS_ENABLE_NT1_VIA_SINGLE + if ( nt == 1 ) + { + // An optimization. If the caller requests only one thread, force + // the sequential level-3 thread decorator even if that means + // overriding the caller's preferred threading implementation (as + // communicated via the rntm_t). + rntm_l = *rntm; + ti = BLIS_SINGLE; + bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm_l ); + rntm = &rntm_l; + } +#endif + + if ( 1 < nt && ti == BLIS_SINGLE ) + { + // Here, we resolve conflicting information. The caller requested + // a sequential threading implementation, but also requested more + // than one thread. Here, we choose to favor the requested threading + // implementation over the number of threads, and so reset all + // parallelism parameters to 1. + rntm_l = *rntm; + nt = 1; + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); + bli_rntm_set_num_threads_only( 1, &rntm_l ); + rntm = &rntm_l; + } + +#if 0 + printf( "(post-opt) moving forward with rntm.thread_impl = %s\n", + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); +#endif + + // Use the timpl_t value to index into the corresponding function address + // from the function pointer array. + const l3_decor_ft fp = l3_decor_fpa[ ti ]; + + // Call the threading-specific decorator function. 
+ fp + ( + func, + family, + alpha, + a, + b, + beta, + c, + cntx, + rntm, + cntl + ); +} + +void bli_l3_thread_decorator_check + ( + rntm_t* rntm + ) +{ + //err_t e_val; + + //e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) ); + //bli_check_error_code( e_val ); + + const timpl_t ti = bli_rntm_thread_impl( rntm ); + + if ( +#ifndef BLIS_ENABLE_OPENMP + ti == BLIS_OPENMP || +#endif +#ifndef BLIS_ENABLE_PTHREADS + ti == BLIS_POSIX || +#endif + FALSE + ) + { + fprintf( stderr, "\n" ); + fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); + bli_abort(); + } +} + diff --git a/frame/thread/bli_l3_decor.h b/frame/thread/bli_l3_decor.h index e2208aae6..087eda874 100644 --- a/frame/thread/bli_l3_decor.h +++ b/frame/thread/bli_l3_decor.h @@ -36,10 +36,8 @@ #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H -// -- conventional definitions ------------------------------------------------- - // Level-3 internal function type. -typedef void (*l3int_t) +typedef void (*l3int_ft) ( const obj_t* alpha, const obj_t* a, @@ -52,19 +50,39 @@ typedef void (*l3int_t) thrinfo_t* thread ); +// Level-3 thread decorator function type. +typedef void (*l3_decor_ft) + ( + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ); + // Level-3 thread decorator prototype. 
void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ); + +void bli_l3_thread_decorator_check + ( + rntm_t* rntm ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index 2c71c7532..890c174cf 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -37,30 +37,33 @@ #ifdef BLIS_ENABLE_OPENMP -// Define a dummy function bli_l3_thread_entry(), which is needed in the -// pthreads version, so that when building Windows DLLs (with OpenMP enabled -// or no multithreading) we don't risk having an unresolved symbol. -void* bli_l3_thread_entry( void* data_void ) { return NULL; } - //#define PRINT_THRINFO +//#define PRINT_IMPL -void bli_l3_thread_decorator +void bli_l3_thread_decorator_openmp ( - l3int_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { // Query the total number of threads from the rntm_t object. const dim_t n_threads = bli_rntm_num_threads( rntm ); +#ifdef PRINT_IMPL + const timpl_t ti = bli_rntm_thread_impl( rntm ); + printf( "l3_decor_openmp: l3 decor with rntm.thread_impl = %s\n", + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? 
"openmp" : "pthreads" ) ) ); +#endif + #ifdef PRINT_THRINFO err_t r_val; thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ), &r_val ); @@ -233,8 +236,10 @@ void bli_l3_thread_decorator_thread_check bli_abort(); } + const timpl_t ti = bli_rntm_thread_impl( rntm ); + //n_threads = 1; // not needed since it has no effect? - bli_thrcomm_init( 1, gl_comm ); + bli_thrcomm_init( ti, 1, gl_comm ); bli_rntm_set_num_threads_only( 1, rntm ); bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); //} diff --git a/frame/thread/bli_l3_decor_openmp.h b/frame/thread/bli_l3_decor_openmp.h index 6ff7f16a9..95e1582e5 100644 --- a/frame/thread/bli_l3_decor_openmp.h +++ b/frame/thread/bli_l3_decor_openmp.h @@ -39,6 +39,20 @@ // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP +void bli_l3_thread_decorator_openmp + ( + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ); + void bli_l3_thread_decorator_thread_check ( dim_t n_threads, diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index 80247dfb1..d31414d3b 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -40,7 +40,7 @@ // A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { - l3int_t func; + l3int_ft func; opid_t family; const obj_t* alpha; const obj_t* a; @@ -60,7 +60,7 @@ void* bli_l3_thread_entry( void* data_void ) { const thread_data_t* data = data_void; - const l3int_t func = data->func; + const l3int_ft func = data->func; const opid_t family = data->family; const obj_t* alpha = data->alpha; const obj_t* a = data->a; @@ -139,25 +139,34 @@ void* bli_l3_thread_entry( void* data_void ) return NULL; } -void bli_l3_thread_decorator +//#define PRINT_IMPL + +void bli_l3_thread_decorator_pthreads ( - l3int_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { err_t r_val; - // Query the total number of threads from the context. + // Query the total number of threads from the rntm_t object. const dim_t n_threads = bli_rntm_num_threads( rntm ); +#ifdef PRINT_IMPL + const timpl_t ti = bli_rntm_thread_impl( rntm ); + printf( "l3_decor_pthrea: l3 decor with rntm.thread_impl = %s\n", + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); +#endif + // NOTE: The sba was initialized in bli_init(). // Check out an array_t from the small block allocator. This is done @@ -244,5 +253,12 @@ void bli_l3_thread_decorator bli_free_intl( datas ); } +#else + +// Define a dummy function bli_l3_thread_entry(), which is needed for +// consistent dynamic linking behavior when building shared objects in Linux +// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. 
+void* bli_l3_thread_entry( void* data_void ) { return NULL; } + #endif diff --git a/frame/thread/bli_l3_decor_pthreads.h b/frame/thread/bli_l3_decor_pthreads.h index 772e05ca7..edf36cf6e 100644 --- a/frame/thread/bli_l3_decor_pthreads.h +++ b/frame/thread/bli_l3_decor_pthreads.h @@ -41,6 +41,20 @@ // Thread entry point prototype. void* bli_l3_thread_entry( void* data_void ); +void bli_l3_thread_decorator_pthreads + ( + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ); + #endif #endif diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c index c2c43b370..6f0f8603b 100644 --- a/frame/thread/bli_l3_decor_single.c +++ b/frame/thread/bli_l3_decor_single.c @@ -35,22 +35,32 @@ #include "blis.h" -#ifndef BLIS_ENABLE_MULTITHREADING +//#define PRINT_IMPL -void bli_l3_thread_decorator +void bli_l3_thread_decorator_single ( - l3int_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { + // For sequential execution, we use only one thread. + const dim_t n_threads = 1; + +#ifdef PRINT_IMPL + const timpl_t ti = bli_rntm_thread_impl( rntm ); + printf( "l3_decor_single: l3 decor with rntm.thread_impl = %s\n", + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); +#endif + obj_t a_t, b_t; bli_obj_alias_to( a, &a_t ); bli_obj_alias_to( b, &b_t ); @@ -66,9 +76,6 @@ void bli_l3_thread_decorator bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); - // For sequential execution, we use only one thread. 
- const dim_t n_threads = 1; - // NOTE: The sba was initialized in bli_init(). // Check out an array_t from the small block allocator. This is done @@ -86,6 +93,12 @@ void bli_l3_thread_decorator // Allcoate a global communicator for the root thrinfo_t structures. thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); +#if 0 + timpl_t ti2 = bli_rntm_thread_impl( rntm ); + printf( "l3_decor_single: created thrcomm_t.ti = %s\n", + ( ti2 == BLIS_SINGLE ? "single" : + ( ti2 == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); +#endif { @@ -150,5 +163,3 @@ void bli_l3_thread_decorator bli_sba_checkin_array( array ); } -#endif - diff --git a/frame/thread/bli_l3_decor_single.h b/frame/thread/bli_l3_decor_single.h index 481763a90..c118ad7be 100644 --- a/frame/thread/bli_l3_decor_single.h +++ b/frame/thread/bli_l3_decor_single.h @@ -35,10 +35,19 @@ #ifndef BLIS_L3_DECOR_SINGLE_H #define BLIS_L3_DECOR_SINGLE_H -// Definitions specific to situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING - -#endif +void bli_l3_thread_decorator_single + ( + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl + ); #endif diff --git a/frame/thread/bli_l3_sup_decor.c b/frame/thread/bli_l3_sup_decor.c new file mode 100644 index 000000000..53c7b41be --- /dev/null +++ b/frame/thread/bli_l3_sup_decor.c @@ -0,0 +1,137 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// Initialize a function pointer array containing function addresses for +// each of the threading-specific level-3 sup thread decorators. + +static l3_sup_decor_ft l3_sup_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bli_l3_sup_thread_decorator_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bli_l3_sup_thread_decorator_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bli_l3_sup_thread_decorator_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; + +// Define a dispatcher that chooses a threading-specific function from the +// above function pointer array. 
+ +err_t bli_l3_sup_thread_decorator + ( + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm + ) +{ + rntm_t rntm_l; + + // Query the threading implementation and the number of threads requested. + timpl_t ti = bli_rntm_thread_impl( rntm ); + dim_t nt = bli_rntm_num_threads( rntm ); + +#ifdef BLIS_ENABLE_NT1_VIA_SINGLE + if ( nt == 1 ) + { + // An optimization. If the caller requests only one thread, force + // the sequential level-3 thread decorator even if that means + // overriding the caller's preferred threading implementation (as + // communicated via the rntm_t). + rntm_l = *rntm; + ti = BLIS_SINGLE; + bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm_l ); + rntm = &rntm_l; + } +#endif + + if ( 1 < nt && ti == BLIS_SINGLE ) + { + // Here, we resolve conflicting information. The caller requested + // a sequential threading implementation, but also requested more + // than one thread. Here, we choose to favor the requested threading + // implementation over the number of threads, and so reset all + // parallelism parameters to 1. + rntm_l = *rntm; + nt = 1; + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); + bli_rntm_set_num_threads_only( 1, &rntm_l ); + rntm = &rntm_l; + } + + // Use the timpl_t value to index into the corresponding function address + // from the function pointer array. + const l3_sup_decor_ft fp = l3_sup_decor_fpa[ ti ]; + + // Call the threading-specific decorator function. 
+ return fp + ( + func, + family, + alpha, + a, + b, + beta, + c, + cntx, + rntm + ); +} + +void bli_l3_sup_thread_decorator_check + ( + rntm_t* rntm + ) +{ + bli_l3_thread_decorator_check( rntm ); // Delegate to the conventional l3 check; calling this function here would recurse without end +} + diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h index 6e0401151..a271920b4 100644 --- a/frame/thread/bli_l3_sup_decor.h +++ b/frame/thread/bli_l3_sup_decor.h @@ -39,7 +39,7 @@ // -- sup definitions ---------------------------------------------------------- // Level-3 sup internal function type. -typedef err_t (*l3supint_t) +typedef err_t (*l3supint_ft) ( const obj_t* alpha, const obj_t* a, @@ -51,18 +51,37 @@ typedef err_t (*l3supint_t) thrinfo_t* thread ); +// Level-3 sup thread decorator function type. +typedef err_t (*l3_sup_decor_ft) + ( + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm + ); + // Level-3 sup thread decorator prototype. 
err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm + ); + +void bli_l3_sup_thread_decorator_check + ( + rntm_t* rntm ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index ff6bc667d..7d06ad622 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -37,24 +37,19 @@ #ifdef BLIS_ENABLE_OPENMP -// Define a dummy function bli_l3_sup_thread_entry(), which is needed in the -// pthreads version, so that when building Windows DLLs (with OpenMP enabled -// or no multithreading) we don't risk having an unresolved symbol. -void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } - //#define PRINT_THRINFO -err_t bli_l3_sup_thread_decorator +err_t bli_l3_sup_thread_decorator_openmp ( - l3supint_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // Query the total number of threads from the rntm_t object. diff --git a/frame/thread/bli_l3_sup_decor_openmp.h b/frame/thread/bli_l3_sup_decor_openmp.h index 1d1097a82..4c5059d00 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.h +++ b/frame/thread/bli_l3_sup_decor_openmp.h @@ -38,6 +38,19 @@ // Definitions specific to situations when OpenMP multithreading is enabled. 
#ifdef BLIS_ENABLE_OPENMP +err_t bli_l3_sup_thread_decorator_openmp + ( + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm + ); + #endif #endif diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index 375a85730..7be5cf8fb 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -40,18 +40,18 @@ // A data structure to assist in passing operands to additional threads. typedef struct thread_data { - l3supint_t func; - opid_t family; - const obj_t* alpha; - const obj_t* a; - const obj_t* b; - const obj_t* beta; - const obj_t* c; - const cntx_t* cntx; - rntm_t* rntm; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; + l3supint_ft func; + opid_t family; + const obj_t* alpha; + const obj_t* a; + const obj_t* b; + const obj_t* beta; + const obj_t* c; + const cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; } thread_data_t; // Entry point for additional threads @@ -59,7 +59,7 @@ void* bli_l3_sup_thread_entry( void* data_void ) { thread_data_t* data = data_void; - l3supint_t func = data->func; + l3supint_ft func = data->func; opid_t family = data->family; const obj_t* alpha = data->alpha; const obj_t* a = data->a; @@ -109,17 +109,17 @@ void* bli_l3_sup_thread_entry( void* data_void ) return NULL; } -err_t bli_l3_sup_thread_decorator +err_t bli_l3_sup_thread_decorator_pthreads ( - l3supint_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { err_t r_val; @@ -214,5 +214,12 @@ err_t bli_l3_sup_thread_decorator return BLIS_SUCCESS; } +#else + +// Define a 
dummy function bli_l3_sup_thread_entry(), which is needed for +// consistent dynamic linking behavior when building shared objects in Linux +// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. +void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } + #endif diff --git a/frame/thread/bli_l3_sup_decor_pthreads.h b/frame/thread/bli_l3_sup_decor_pthreads.h index 1362b4035..310ea4e8b 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.h +++ b/frame/thread/bli_l3_sup_decor_pthreads.h @@ -41,6 +41,19 @@ // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); +err_t bli_l3_sup_thread_decorator_pthreads + ( + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm + ); + #endif #endif diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c index df767ad29..a419154e7 100644 --- a/frame/thread/bli_l3_sup_decor_single.c +++ b/frame/thread/bli_l3_sup_decor_single.c @@ -35,21 +35,19 @@ #include "blis.h" -#ifndef BLIS_ENABLE_MULTITHREADING - #define SKIP_THRINFO_TREE -err_t bli_l3_sup_thread_decorator +err_t bli_l3_sup_thread_decorator_single ( - l3supint_t func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // For sequential execution, we use only one thread. 
@@ -138,5 +136,3 @@ err_t bli_l3_sup_thread_decorator return BLIS_SUCCESS; } -#endif - diff --git a/frame/thread/bli_l3_sup_decor_single.h b/frame/thread/bli_l3_sup_decor_single.h index 418c3814c..8ca279baf 100644 --- a/frame/thread/bli_l3_sup_decor_single.h +++ b/frame/thread/bli_l3_sup_decor_single.h @@ -35,10 +35,18 @@ #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H -// Definitions specific to situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING - -#endif +err_t bli_l3_sup_thread_decorator_single + ( + l3supint_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm + ); #endif diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index ef46a7ad4..6cd4325df 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -35,8 +35,183 @@ #include "blis.h" +// -- Method-agnostic functions ------------------------------------------------ + +thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) +{ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_thrcomm_create(): " ); + #endif + + thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); + + const timpl_t ti = bli_rntm_thread_impl( rntm ); + + bli_thrcomm_init( ti, n_threads, comm ); + + return comm; +} + +void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) +{ + if ( comm == NULL ) return; + + const timpl_t ti = bli_rntm_thread_impl( rntm ); + + bli_thrcomm_cleanup( ti, comm ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_thrcomm_free(): " ); + #endif + + bli_sba_release( rntm, comm ); +} + +// -- Method-specific functions ------------------------------------------------ + +// Initialize a function pointer array for each family of threading-specific +// functions (init, cleanup, and barrier). 
+ +static thrcomm_init_ft init_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bli_thrcomm_init_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bli_thrcomm_init_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bli_thrcomm_init_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; +static thrcomm_cleanup_ft cleanup_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bli_thrcomm_cleanup_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bli_thrcomm_cleanup_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bli_thrcomm_cleanup_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; +static thrcomm_barrier_ft barrier_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bli_thrcomm_barrier_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bli_thrcomm_barrier_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + bli_thrcomm_barrier_pthreads, +#else + bli_thrcomm_barrier_single, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bli_thrcomm_barrier_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + bli_thrcomm_barrier_openmp, +#else + bli_thrcomm_barrier_single, +#endif +}; + +// Define dispatchers that choose a threading-specific function from each +// of the above function pointer arrays. + +void bli_thrcomm_init( timpl_t ti, dim_t nt, thrcomm_t* comm ) +{ + const thrcomm_init_ft fp = init_fpa[ ti ]; + + if ( fp == NULL ) bli_abort(); + + // Call the threading-specific init function. + fp( nt, comm ); + + // Embed the type of threading implementation within the thrcomm_t struct. + // This can be used later to make sure the application doesn't use a + // thrcomm_t initialized with threading type A with the API for threading + // type B. 
Note that we wait until after the init function has returned + // in case that function zeros out the entire struct before setting the + // fields. + comm->ti = ti; +} + +void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm ) +{ + const thrcomm_cleanup_ft fp = cleanup_fpa[ ti ]; + + if ( fp == NULL ) bli_abort(); + + // If comm is BLIS_SINGLE_COMM, we return early since there is no cleanup, + // especially if it is being used with a threading implementation that + // would normally want to free its thrcomm_t resources. + if ( comm == &BLIS_SINGLE_COMM ) return; + + // Sanity check. Make sure the threading implementation we were asked to use + // is the same as the implementation that initialized the thrcomm_t object. + if ( ti != comm->ti ) + { + printf( "bli_thrcomm_cleanup(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n", + ( comm->ti == BLIS_SINGLE ? "single" : + ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ), + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); + bli_abort(); + } + + // Call the threading-specific cleanup function. + fp( comm ); +} + +void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm ) +{ + const thrcomm_barrier_ft fp = barrier_fpa[ ti ]; + + if ( fp == NULL ) bli_abort(); + + // Sanity check. Make sure the threading implementation we were asked to use + // is the same as the implementation that initialized the thrcomm_t object. + // We skip this check if comm is BLIS_SINGLE_COMM since the timpl_t value + // embedded in comm will often be different than that of BLIS_SINGLE_COMM + // (but we don't return early since we still need to barrier... wait, or do + // we?). + if ( ti != comm->ti && comm != &BLIS_SINGLE_COMM ) + { + printf( "bli_thrcomm_barrier(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n", + ( comm->ti == BLIS_SINGLE ? "single" : + ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ), + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? 
"openmp" : "pthreads" ) ) ); + bli_abort(); + } + + // Call the threading-specific barrier function. + fp( tid, comm ); +} + +// -- Other functions ---------------------------------------------------------- + void* bli_thrcomm_bcast ( + timpl_t ti, dim_t id, void* to_send, thrcomm_t* comm @@ -46,9 +221,9 @@ void* bli_thrcomm_bcast if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( id, comm ); + bli_thrcomm_barrier( ti, id, comm ); void* object = comm->sent_object; - bli_thrcomm_barrier( id, comm ); + bli_thrcomm_barrier( ti, id, comm ); return object; } diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index d0ffb1346..4532fd00d 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -36,12 +36,82 @@ #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H +// Define barrier_t, which is specific to the tree barrier in the OpenMP +// implementation. This needs to be done first since it is (potentially) +// used within the definition of thrcomm_t below. + +#ifdef BLIS_ENABLE_OPENMP +#ifdef BLIS_TREE_BARRIER +struct barrier_s +{ + int arity; + int count; + struct barrier_s* dad; + volatile int signal; +}; +typedef struct barrier_s barrier_t; +#endif +#endif + +// Define the thrcomm_t structure, which will be common to all threading +// implementations. + +typedef struct thrcomm_s +{ + // -- Fields common to all threading implementations -- + + void* sent_object; + dim_t n_threads; + timpl_t ti; + + // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon + // redefining bool_t as bool we discovered that some gcc __atomic built-ins + // don't allow the use of bool for the variables being operated upon. + // (Specifically, this was observed of __atomic_fetch_xor(), but it likely + // applies to all other related built-ins.) Thus, we get around this by + // redefining barrier_sense as a gint_t. 
+ //volatile gint_t barrier_sense; + gint_t barrier_sense; + dim_t barrier_threads_arrived; + + // -- Fields specific to OpenMP -- + + #ifdef BLIS_ENABLE_OPENMP + #ifdef BLIS_TREE_BARRIER + // This field is only needed if the tree barrier implementation is being + // compiled. The non-tree barrier code does not use it. + barrier_t** barriers; + #endif + #endif + + // -- Fields specific to pthreads -- + + #ifdef BLIS_ENABLE_PTHREADS + #ifdef BLIS_USE_PTHREAD_BARRIER + // This field is only needed if the pthread_barrier_t implementation is + // being compiled. The non-pthread_barrier_t code does not use it. + bli_pthread_barrier_t barrier; + #endif + #endif + +} thrcomm_t; + + + + + // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. #include "bli_thrcomm_single.h" #include "bli_thrcomm_openmp.h" #include "bli_thrcomm_pthreads.h" +// Define a function pointer type for each of the functions that are +// "overloaded" by each method of multithreading. +typedef void (*thrcomm_init_ft)( dim_t nt, thrcomm_t* comm ); +typedef void (*thrcomm_cleanup_ft)( thrcomm_t* comm ); +typedef void (*thrcomm_barrier_ft)( dim_t tid, thrcomm_t* comm ); + // thrcomm_t query (field only) @@ -51,16 +121,22 @@ BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) } -// Thread communicator prototypes. +// Threading method-agnostic function prototypes. thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); -void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); -void bli_thrcomm_cleanup( thrcomm_t* comm ); - -BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); -BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); -void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); +// Threading method-specific function prototypes. 
+// NOTE: These are the prototypes to the dispatcher functions and thus they +// require the timpl_t as an argument. The threading-specific functions can +// (and do) omit the timpl_t from their function signatures since their +// threading implementation is intrinsically known. +void bli_thrcomm_init( timpl_t ti, dim_t n_threads, thrcomm_t* comm ); +void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm ); +BLIS_EXPORT_BLIS void bli_thrcomm_barrier( timpl_t ti, dim_t thread_id, thrcomm_t* comm ); + +// Other function prototypes. +BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( timpl_t ti, dim_t inside_id, void* to_send, thrcomm_t* comm ); +void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 9bb35ea31..a42dabe18 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -37,35 +37,13 @@ #ifdef BLIS_ENABLE_OPENMP -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) -{ - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_thrcomm_create(): " ); - #endif - - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); - - bli_thrcomm_init( n_threads, comm ); - - return comm; -} - -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) -{ - if ( comm == NULL ) return; - - bli_thrcomm_cleanup( comm ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_thrcomm_free(): " ); - #endif - - bli_sba_release( rntm, comm ); -} - #ifndef BLIS_TREE_BARRIER -void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) +// Define the non-tree barrier implementations of the init, cleanup, and +// barrier functions. These are the default unless the tree barrier +// versions are requested at compile-time. 
+ +void bli_thrcomm_init_openmp( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; comm->sent_object = NULL; @@ -75,14 +53,15 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) } -void bli_thrcomm_cleanup( thrcomm_t* comm ) +void bli_thrcomm_cleanup_openmp( thrcomm_t* comm ) { - if ( comm == NULL ) return; + //if ( comm == NULL ) return; + return; } //'Normal' barrier for openmp //barrier routine taken from art of multicore programming -void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm ) { #if 0 if ( comm == NULL || comm->n_threads == 1 ) @@ -109,7 +88,10 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) #else -void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) +// Define the tree barrier implementations of the init, cleanup, and +// barrier functions. + +void bli_thrcomm_init_openmp( dim_t n_threads, thrcomm_t* comm ) { err_t r_val; @@ -120,6 +102,23 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 ); } +void bli_thrcomm_cleanup_openmp( thrcomm_t* comm ) +{ + if ( comm == NULL ) return; + for ( dim_t i = 0; i < comm->n_threads; i++ ) + { + bli_thrcomm_tree_barrier_free( comm->barriers[i] ); + } + bli_free_intl( comm->barriers ); +} + +void bli_thrcomm_barrier_openmp( dim_t t_id, thrcomm_t* comm ) +{ + bli_thrcomm_tree_barrier( comm->barriers[t_id] ); +} + +// -- Helper functions --------------------------------------------------------- + //Tree barrier used for Intel Xeon Phi barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ) { @@ -164,16 +163,6 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_ return me; } -void bli_thrcomm_cleanup( thrcomm_t* comm ) -{ - if ( comm == NULL ) return; - for ( dim_t i = 0; i < comm->n_threads; i++ ) - { - bli_thrcomm_tree_barrier_free( 
comm->barriers[i] ); - } - bli_free_intl( comm->barriers ); -} - void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) { if ( barrier == NULL ) @@ -187,11 +176,6 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) return; } -void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) -{ - bli_thrcomm_tree_barrier( comm->barriers[t_id] ); -} - void bli_thrcomm_tree_barrier( barrier_t* barack ) { int my_signal = barack->signal; diff --git a/frame/thread/bli_thrcomm_openmp.h b/frame/thread/bli_thrcomm_openmp.h index 3abfd0a41..8c33d0c2f 100644 --- a/frame/thread/bli_thrcomm_openmp.h +++ b/frame/thread/bli_thrcomm_openmp.h @@ -36,53 +36,22 @@ #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H -// Define thrcomm_t for situations when OpenMP multithreading is enabled. +// Define these prototypes for situations when OpenMP multithreading is +// enabled. #ifdef BLIS_ENABLE_OPENMP #include -// Define thrcomm_t for tree barriers and non-tree barriers. -#ifdef BLIS_TREE_BARRIER -struct barrier_s -{ - int arity; - int count; - struct barrier_s* dad; - volatile int signal; -}; -typedef struct barrier_s barrier_t; - -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - barrier_t** barriers; -}; -#else -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - - // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon - // redefining bool_t as bool we discovered that some gcc __atomic built-ins - // don't allow the use of bool for the variables being operated upon. - // (Specifically, this was observed of __atomic_fetch_xor(), but it likely - // applies to all other related built-ins.) Thus, we get around this by - // redefining barrier_sense as a gint_t. - //volatile gint_t barrier_sense; - gint_t barrier_sense; - dim_t barrier_threads_arrived; -}; -#endif - -typedef struct thrcomm_s thrcomm_t; +// OpenMP-specific function prototypes. 
+void bli_thrcomm_init_openmp( dim_t nt, thrcomm_t* comm ); +void bli_thrcomm_cleanup_openmp( thrcomm_t* comm ); +void bli_thrcomm_barrier_openmp( dim_t tid, thrcomm_t* comm ); -// Prototypes specific to tree barriers. +// Prototypes specific to the OpenMP tree barrier implementation. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); -void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); -void bli_thrcomm_tree_barrier( barrier_t* barack ); +void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); +void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index d0896f94d..39b15d590 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -37,35 +37,12 @@ #ifdef BLIS_ENABLE_PTHREADS -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) -{ - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_thrcomm_create(): " ); - #endif - - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); - - bli_thrcomm_init( n_threads, comm ); - - return comm; -} - -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) -{ - if ( comm == NULL ) return; - - bli_thrcomm_cleanup( comm ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_thrcomm_free(): " ); - #endif - - bli_sba_release( rntm, comm ); -} - #ifdef BLIS_USE_PTHREAD_BARRIER -void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) +// Define the pthread_barrier_t implementations of the init, cleanup, and +// barrier functions. 
+ +void bli_thrcomm_init_pthreads( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; comm->sent_object = NULL; @@ -73,7 +50,7 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) bli_pthread_barrier_init( &comm->barrier, NULL, n_threads ); } -void bli_thrcomm_cleanup( thrcomm_t* comm ) +void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm ) { if ( comm == NULL ) return; bli_pthread_barrier_destroy( &comm->barrier ); @@ -86,7 +63,11 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) #else -void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) +// Define the non-pthread_barrier_t implementations of the init, cleanup, +// and barrier functions. These are the default unless the pthread_barrier_t +// versions are requested at compile-time. + +void bli_thrcomm_init_pthreads( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; comm->sent_object = NULL; @@ -95,11 +76,11 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* comm ) +void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm ) { } -void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_pthreads( dim_t t_id, thrcomm_t* comm ) { #if 0 if ( comm == NULL || comm->n_threads == 1 ) return; diff --git a/frame/thread/bli_thrcomm_pthreads.h b/frame/thread/bli_thrcomm_pthreads.h index 2c2e88551..9a2447b99 100644 --- a/frame/thread/bli_thrcomm_pthreads.h +++ b/frame/thread/bli_thrcomm_pthreads.h @@ -35,36 +35,13 @@ #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H -// Define thrcomm_t for situations when POSIX multithreading is enabled. +// Define these prototypes for situations when POSIX multithreading is enabled. 
#ifdef BLIS_ENABLE_PTHREADS -#ifdef BLIS_USE_PTHREAD_BARRIER -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - - bli_pthread_barrier_t barrier; -}; -#else -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - - // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon - // redefining bool_t as bool we discovered that some gcc __atomic built-ins - // don't allow the use of bool for the variables being operated upon. - // (Specifically, this was observed of __atomic_fetch_xor(), but it likely - // applies to all other related built-ins.) Thus, we get around this by - // redefining barrier_sense as a gint_t. - //volatile gint_t barrier_sense; - gint_t barrier_sense; - dim_t barrier_threads_arrived; -}; -#endif - -typedef struct thrcomm_s thrcomm_t; +// pthreads-specific function prototypes. +void bli_thrcomm_init_pthreads( dim_t nt, thrcomm_t* comm ); +void bli_thrcomm_cleanup_pthreads( thrcomm_t* comm ); +void bli_thrcomm_barrier_pthreads( dim_t tid, thrcomm_t* comm ); #endif diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index cedb3c5b6..cb12e37f3 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -35,36 +35,7 @@ #include "blis.h" -#ifndef BLIS_ENABLE_MULTITHREADING - -//Constructors and destructors for constructors -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) -{ - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_thrcomm_create(): " ); - #endif - - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof( thrcomm_t ) ); - - bli_thrcomm_init( n_threads, comm ); - - return comm; -} - -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) -{ - if ( comm == NULL ) return; - - bli_thrcomm_cleanup( comm ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_thrcomm_free(): " ); - #endif - - bli_sba_release( rntm, comm ); -} - -void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) +void bli_thrcomm_init_single( dim_t n_threads, thrcomm_t* comm ) { if ( 
comm == NULL ) return; @@ -74,15 +45,13 @@ void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* comm ) +void bli_thrcomm_cleanup_single( thrcomm_t* comm ) { if ( comm == NULL ) return; } -void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) +void bli_thrcomm_barrier_single( dim_t t_id, thrcomm_t* comm ) { return; } -#endif - diff --git a/frame/thread/bli_thrcomm_single.h b/frame/thread/bli_thrcomm_single.h index c10727df2..fffb3fb75 100644 --- a/frame/thread/bli_thrcomm_single.h +++ b/frame/thread/bli_thrcomm_single.h @@ -35,45 +35,13 @@ #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H -// Define thrcomm_t for situations when multithreading is disabled. -#ifndef BLIS_ENABLE_MULTITHREADING +// Always define these prototypes since disabling multithreading is always +// an option. -//thread communicators may be implementation dependent -#ifdef BLIS_TREE_BARRIER -struct barrier_s -{ - int arity; - int count; - struct barrier_s* dad; - int signal; -}; -typedef struct barrier_s barrier_t; - -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - barrier_t** barriers; -}; -#else -struct thrcomm_s -{ - void* sent_object; - dim_t n_threads; - - // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon - // redefining bool_t as bool we discovered that some gcc __atomic built-ins - // don't allow the use of bool for the variables being operated upon. - // (Specifically, this was observed of __atomic_fetch_xor(), but it likely - // applies to all other related built-ins.) Thus, we get around this by - // redefining barrier_sense as a gint_t. - gint_t barrier_sense; - dim_t barrier_threads_arrived; -}; -#endif -typedef struct thrcomm_s thrcomm_t; - -#endif +// Sequential-specific function prototypes. 
+void bli_thrcomm_init_single( dim_t nt, thrcomm_t* comm ); +void bli_thrcomm_cleanup_single( thrcomm_t* comm ); +void bli_thrcomm_barrier_single( dim_t tid, thrcomm_t* comm ); #endif diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 9bad6a456..9369b373b 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -50,7 +50,7 @@ extern bli_pthread_mutex_t global_rntm_mutex; void bli_thread_init( void ) { - bli_thrcomm_init( 1, &BLIS_SINGLE_COMM ); + bli_thrcomm_init( BLIS_SINGLE, 1, &BLIS_SINGLE_COMM ); bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); @@ -1554,6 +1554,14 @@ dim_t bli_thread_get_num_threads( void ) return bli_rntm_num_threads( &global_rntm ); } +timpl_t bli_thread_get_thread_impl( void ) +{ + // We must ensure that global_rntm has been initialized. + bli_init_once(); + + return bli_rntm_thread_impl( &global_rntm ); +} + // ---------------------------------------------------------------------------- void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ) @@ -1584,8 +1592,24 @@ void bli_thread_set_num_threads( dim_t n_threads ) bli_pthread_mutex_unlock( &global_rntm_mutex ); } +void bli_thread_set_thread_impl( timpl_t ti ) +{ + // We must ensure that global_rntm has been initialized. + bli_init_once(); + + // Acquire the mutex protecting global_rntm. + bli_pthread_mutex_lock( &global_rntm_mutex ); + + bli_rntm_set_thread_impl_only( ti, &global_rntm ); + + // Release the mutex protecting global_rntm. + bli_pthread_mutex_unlock( &global_rntm_mutex ); +} + // ---------------------------------------------------------------------------- +//#define PRINT_IMPL + void bli_thread_init_rntm_from_env ( rntm_t* rntm @@ -1606,18 +1630,69 @@ void bli_thread_init_rntm_from_env // function guarantees that the rntm_t has sane values in the event that the // application passed in a custom rntm_t via an expert interface. 
- bool auto_factor = FALSE; - dim_t nt; - dim_t jc, pc, ic, jr, ir; + bool auto_factor = FALSE; + dim_t nt; + dim_t jc, pc, ic, jr, ir; + timpl_t ti; #ifdef BLIS_ENABLE_MULTITHREADING + // Try to read BLIS_THREAD_IMPL. + char* ti_env = bli_env_get_str( "BLIS_THREAD_IMPL" ); + + // If BLIS_THREAD_IMPL was not set, try to read BLIS_TI. + if ( ti_env == NULL ) ti_env = bli_env_get_str( "BLIS_TI" ); + + if ( ti_env != NULL ) + { + // If BLIS_THREAD_IMPL was set, parse the value. If the value was + // anything other than a "openmp" or "pthreads" (or reasonable + // variations thereof), interpret it as a request for single-threaded + // execution. + if ( !strncmp( ti_env, "openmp", 6 ) ) ti = BLIS_OPENMP; + else if ( !strncmp( ti_env, "omp", 3 ) ) ti = BLIS_OPENMP; + else if ( !strncmp( ti_env, "pthreads", 8 ) ) ti = BLIS_POSIX; + else if ( !strncmp( ti_env, "pthread", 7 ) ) ti = BLIS_POSIX; + else if ( !strncmp( ti_env, "posix", 5 ) ) ti = BLIS_POSIX; + else ti = BLIS_SINGLE; + + #ifdef PRINT_IMPL + if ( ti == BLIS_OPENMP ) + printf( "detected BLIS_THREAD_IMPL=openmp.\n" ); + else if ( ti == BLIS_POSIX ) + printf( "detected BLIS_THREAD_IMPL=pthreads.\n" ); + else + printf( "detected BLIS_THREAD_IMPL=single.\n" ); + #endif + } + else + { + // If BLIS_THREAD_IMPL was unset, default to the implementation that + // was determined at configure-time. + #ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT + ti = BLIS_OPENMP; + #endif + #ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT + ti = BLIS_POSIX; + #endif + + #ifdef PRINT_IMPL + printf( "BLIS_THREAD_IMPL unset.\n" ); + if ( ti == BLIS_OPENMP ) + printf( "defaulting to BLIS_THREAD_IMPL=openmp.\n" ); + else if ( ti == BLIS_POSIX ) + printf( "defaulting to BLIS_THREAD_IMPL=pthreads.\n" ); + #endif + } + // Try to read BLIS_NUM_THREADS first. nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 ); - // If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS. 
- if ( nt == -1 ) - nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); + // If BLIS_NUM_THREADS was not set, try to read BLIS_NT. + if ( nt == -1 ) nt = bli_env_get_var( "BLIS_NT", -1 ); + + // If neither BLIS_NUM_THREADS nor BLIS_NT were set, try OMP_NUM_THREADS. + if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); // Read the environment variables for the number of threads (ways of // parallelism) for each individual loop. @@ -1650,13 +1725,15 @@ void bli_thread_init_rntm_from_env // Now we use the values of nt_set and ways_set to determine how to // interpret the original values we found in the rntm_t object. - if ( ways_set == TRUE ) + if ( ( ways_set == TRUE && nt_set == TRUE ) || + ( ways_set == TRUE && nt_set == FALSE ) ) { // If the per-loop ways of parallelism were set, then we use the values // that were given and interpreted above. The only thing left to do is - // calculate the correct number of threads. Notice that if the user also - // happened to set BLIS_NUM_THREADS, that value is discarded in favor of - // the implied value from the per-loop ways of parallelism. + // calculate the correct number of threads. Notice that whatever value + // may have been assigned to BLIS_NUM_THREADS will be ignored, and the + // total number of threads will be taken to be the number implied from + // the per-loop ways of parallelism. nt = jc * pc * ic * jr * ir; auto_factor = FALSE; @@ -1682,21 +1759,27 @@ void bli_thread_init_rntm_from_env #else + // Note that we don't even bother checking BLIS_THREAD_IMPL if neither + // OpenMP nor pthreads was enabled at compile time. + ti = BLIS_SINGLE; + // When multithreading is disabled, always set the per-loop ways of // parallelism to 1. - nt = 1; jc = pc = ic = jr = ir = 1; + nt = 1; + auto_factor = FALSE; #endif // Save the results back in the runtime object. 
- bli_rntm_set_auto_factor_only( auto_factor, rntm ); + bli_rntm_set_thread_impl_only( ti, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); + bli_rntm_set_auto_factor_only( auto_factor, rntm ); -#if 0 + #if 0 printf( "bli_thread_init_rntm_from_env()\n" ); bli_rntm_print( rntm ); -#endif + #endif } diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 5e9c650b5..509072e57 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -195,17 +195,19 @@ dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- -BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); -BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); - -BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); -BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); - -void bli_thread_init_rntm_from_env( rntm_t* rntm ); +BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); +BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); +BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); +BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); +BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); +BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); +BLIS_EXPORT_BLIS timpl_t bli_thread_get_thread_impl( void ); + +BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); +BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); +BLIS_EXPORT_BLIS void bli_thread_set_thread_impl( timpl_t ti ); + +void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- diff --git 
a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index bbe711400..3730ab946 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -360,7 +360,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl // Broadcast the temporary array to all threads in the parent's // communicator. - new_comms = bli_thread_broadcast( thread_par, new_comms ); + new_comms = bli_thread_broadcast( rntm, thread_par, new_comms ); // Chiefs in the child communicator allocate the communicator // object and store it in the array element corresponding to the @@ -368,7 +368,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl if ( child_comm_id == 0 ) new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - bli_thread_barrier( thread_par ); + bli_thread_barrier( rntm, thread_par ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. @@ -384,7 +384,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl NULL // sub_node ); - bli_thread_barrier( thread_par ); + bli_thread_barrier( rntm, thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. @@ -497,7 +497,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode const dim_t child_comm_id = parent_comm_id % child_nt_in; const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - bli_thread_barrier( thread_par ); + bli_thread_barrier( rntm, thread_par ); // NOTE: Recall that parent_comm_id == child_comm_id, so checking for the // parent's chief-ness is equivalent to checking for chief-ness in the new @@ -508,7 +508,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode // Broadcast the new thrcomm_t address to the other threads in the // parent's group. - new_comm = bli_thread_broadcast( thread_par, new_comm ); + new_comm = bli_thread_broadcast( rntm, thread_par, new_comm ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. 
@@ -524,7 +524,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode NULL // sub_node ); - bli_thread_barrier( thread_par ); + bli_thread_barrier( rntm, thread_par ); return thread_chl; } diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 6b9809684..9d234bc91 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -171,14 +171,22 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* // other thrinfo_t-related functions -BLIS_INLINE void* bli_thread_broadcast( const thrinfo_t* t, void* p ) +BLIS_INLINE void* bli_thread_broadcast( const rntm_t* rntm, const thrinfo_t* t, void* p ) { - return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); + // We can't use any bli_rntm_*() APIs here because they haven't been + // defined yet. So we have to manually access the timpl_t field (le ugh). + //const timpl_t ti = bli_rntm_thread_impl( rntm ); + + return bli_thrcomm_bcast( rntm->thread_impl, t->ocomm_id, p, t->ocomm ); } -BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t ) +BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t ) { - bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); + // We can't use any bli_rntm_*() APIs here because they haven't been + // defined yet. So we have to manually access the timpl_t field (le ugh). + //const timpl_t ti = bli_rntm_thread_impl( rntm ); + + bli_thrcomm_barrier( rntm->thread_impl, t->ocomm_id, t->ocomm ); } diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index 966247fd0..26a40e00f 100644 --- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -250,7 +250,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl // Broadcast the temporary array to all threads in the parent's // communicator. 
- new_comms = bli_thread_broadcast( thread_par, new_comms ); + new_comms = bli_thread_broadcast( rntm, thread_par, new_comms ); // Chiefs in the child communicator allocate the communicator // object and store it in the array element corresponding to the @@ -258,7 +258,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl if ( child_comm_id == 0 ) new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - bli_thread_barrier( thread_par ); + bli_thread_barrier( rntm, thread_par ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. @@ -274,7 +274,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl NULL // sub_node ); - bli_thread_barrier( thread_par ); + bli_thread_barrier( rntm, thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index d960928a4..1e567a114 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -40,11 +40,11 @@ void bls_gemm ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c ) { bls_gemm_ex @@ -61,13 +61,13 @@ void bls_gemm void bls_gemm_ex ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h index b296ac1c0..d01c6647e 100644 --- a/sandbox/gemmlike/bls_gemm.h +++ b/sandbox/gemmlike/bls_gemm.h @@ -38,22 +38,22 @@ void bls_gemm ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c ); void bls_gemm_ex ( - obj_t* alpha, - obj_t* a, - obj_t* b, - 
obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); // diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 1e3e5ea03..c8fd50083 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -446,7 +446,7 @@ void PASTECH2(bls_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. */ \ - bli_thread_barrier( thread_pb ); \ + bli_thread_barrier( rntm, thread_pb ); \ } \ } \ \ diff --git a/sandbox/gemmlike/bls_gemm_check.c b/sandbox/gemmlike/bls_gemm_check.c index 369017338..9cfcf8063 100644 --- a/sandbox/gemmlike/bls_gemm_check.c +++ b/sandbox/gemmlike/bls_gemm_check.c @@ -36,12 +36,12 @@ void bls_gemm_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); diff --git a/sandbox/gemmlike/bls_gemm_check.h b/sandbox/gemmlike/bls_gemm_check.h index 8b9706991..bd96c4cff 100644 --- a/sandbox/gemmlike/bls_gemm_check.h +++ b/sandbox/gemmlike/bls_gemm_check.h @@ -39,11 +39,11 @@ void bls_gemm_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index 0dcc531fd..9e1f67fc5 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -61,7 +61,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. 
*/ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -317,7 +317,7 @@ void PASTECH2(bls_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_a ) diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index 9d563109a..cb8275fae 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -61,7 +61,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. 
*/ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -317,7 +317,7 @@ void PASTECH2(bls_,ch,opname) \ ); \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ + bli_thread_barrier( rntm, thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_b ) diff --git a/sandbox/gemmlike/thread/bls_l3_decor.c b/sandbox/gemmlike/thread/bls_l3_decor.c new file mode 100644 index 000000000..7fa799f14 --- /dev/null +++ b/sandbox/gemmlike/thread/bls_l3_decor.c @@ -0,0 +1,148 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// Initialize a function pointer array containing function addresses for +// each of the threading-specific level-3 thread decorators. + +static l3sbx_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bls_l3_thread_decorator_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bls_l3_thread_decorator_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bls_l3_thread_decorator_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; + +// Define a dispatcher that chooses a threading-specific function from the +// above function pointer array. + +void bls_l3_thread_decorator + ( + l3sbxint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ) +{ + rntm_t rntm_l; + + // Query the threading implementation and the number of threads requested. 
+ timpl_t ti = bli_rntm_thread_impl( rntm ); + dim_t nt = bli_rntm_num_threads( rntm ); + + if ( bli_error_checking_is_enabled() ) + bls_l3_thread_decorator_check( rntm ); + + if ( 1 < nt && ti == BLIS_SINGLE ) + { + // Here, we resolve conflicting information. The caller requested + // a sequential threading implementation, but also requested more + // than one thread. Here, we choose to favor the requested threading + // implementation over the number of threads, and so reset all + // parallelism parameters to 1. + rntm_l = *rntm; + nt = 1; + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); + bli_rntm_set_num_threads_only( 1, &rntm_l ); + rntm = &rntm_l; + } + + // Use the timpl_t value to index into the corresponding function address + // from the function pointer array. + const l3sbx_decor_ft fp = l3_decor_fpa[ ti ]; + + // Call the threading-specific decorator function. + fp + ( + func, + family, + alpha, + a, + b, + beta, + c, + cntx, + rntm + ); +} + +void bls_l3_thread_decorator_check + ( + rntm_t* rntm + ) +{ + //err_t e_val; + + //e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) ); + //bli_check_error_code( e_val ); + + const timpl_t ti = bli_rntm_thread_impl( rntm ); + + if ( +#ifndef BLIS_ENABLE_OPENMP + ti == BLIS_OPENMP || +#endif +#ifndef BLIS_ENABLE_PTHREADS + ti == BLIS_POSIX || +#endif + FALSE + ) + { + fprintf( stderr, "\n" ); + fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? 
"openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); + bli_abort(); + } +} + diff --git a/sandbox/gemmlike/thread/bls_l3_decor.h b/sandbox/gemmlike/thread/bls_l3_decor.h index bb8a95bb4..58b076270 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor.h +++ b/sandbox/gemmlike/thread/bls_l3_decor.h @@ -35,10 +35,8 @@ #ifndef BLIS_SBX_L3_DECOR_H #define BLIS_SBX_L3_DECOR_H -// -- sup definitions ---------------------------------------------------------- - // Level-3 sup internal function type. -typedef void (*l3sbxint_t) +typedef void (*l3sbxint_ft) ( obj_t* alpha, obj_t* a, @@ -50,18 +48,37 @@ typedef void (*l3sbxint_t) thrinfo_t* thread ); -// Level-3 sup thread decorator prototype. +// Level-3 thread decorator function type. +typedef void (*l3sbx_decor_ft) + ( + l3sbxint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +// Level-3 thread decorator prototype. void bls_l3_thread_decorator ( - l3sbxint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3sbxint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + +void bls_l3_thread_decorator_check + ( + rntm_t* rntm ); // Include definitions specific to the method of multithreading. diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c index bf0d4d8bc..9c29ef27e 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c @@ -36,16 +36,11 @@ #ifdef BLIS_ENABLE_OPENMP -// Define a dummy thread entry function, which is needed in the pthreads -// version, so that when building Windows DLLs (with OpenMP enabled or with -// no multithreading) we don't risk having an unresolved symbol. 
-void* bls_l3_thread_entry( void* data_void ) { return NULL; } - //#define PRINT_THRINFO -void bls_l3_thread_decorator +void bls_l3_thread_decorator_openmp ( - l3sbxint_t func, + l3sbxint_ft func, opid_t family, obj_t* alpha, obj_t* a, @@ -65,7 +60,7 @@ void bls_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -78,7 +73,7 @@ void bls_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) @@ -93,8 +88,6 @@ void bls_l3_thread_decorator const dim_t tid = omp_get_thread_num(); // Check for a somewhat obscure OpenMP thread-mistmatch issue. - // NOTE: This calls the same function used for the conventional/large - // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); // Use the thread id to access the appropriate pool_t* within the diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h index 9c956d7c3..8198a1ba1 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h @@ -38,6 +38,19 @@ // Definitions specific to situations when OpenMP multithreading is enabled. 
#ifdef BLIS_ENABLE_OPENMP +void bls_l3_thread_decorator_openmp + ( + l3sbxint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + #endif #endif diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c index ff723a4ce..95d0e968e 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c @@ -39,18 +39,18 @@ // A data structure to assist in passing operands to additional threads. typedef struct thread_data { - l3sbxint_t func; - opid_t family; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - cntx_t* cntx; - rntm_t* rntm; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; + l3sbxint_ft func; + opid_t family; + obj_t* alpha; + obj_t* a; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; } thread_data_t; // Entry point function for additional threads. @@ -58,7 +58,7 @@ void* bls_l3_thread_entry( void* data_void ) { thread_data_t* data = data_void; - l3sbxint_t func = data->func; + l3sbxint_ft func = data->func; opid_t family = data->family; obj_t* alpha = data->alpha; obj_t* a = data->a; @@ -108,17 +108,17 @@ void* bls_l3_thread_entry( void* data_void ) return NULL; } -void bls_l3_thread_decorator +void bls_l3_thread_decorator_pthreads ( - l3sbxint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3sbxint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm ) { err_t r_val; @@ -145,7 +145,7 @@ void bls_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. 
- thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. @@ -211,5 +211,12 @@ void bls_l3_thread_decorator bli_free_intl( datas ); } +#else + +// Define a dummy function bli_l3_thread_entry(), which is needed for +// consistent dynamic linking behavior when building shared objects in Linux +// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. +void* bli_l3_thread_entry( void* data_void ) { return NULL; } + #endif diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h index ef5c3bad4..162086bb0 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h @@ -41,6 +41,19 @@ // Thread entry point prototype. void* bls_l3_thread_entry( void* data_void ); +void bls_l3_thread_decorator_pthreads + ( + l3sbxint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); + #endif #endif diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c index 8bb04817f..b5f5a6669 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_single.c @@ -34,23 +34,21 @@ #include "blis.h" -#ifndef BLIS_ENABLE_MULTITHREADING - #define SKIP_THRINFO_TREE -void bls_l3_thread_decorator +void bls_l3_thread_decorator_single ( - l3sbxint_t func, - opid_t family, - //pack_t schema_a, - //pack_t schema_b, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3sbxint_ft func, + opid_t family, + //pack_t schema_a, + //pack_t schema_b, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm ) { // For sequential execution, we use only one 
thread. @@ -62,7 +60,7 @@ void bls_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. bli_sba_rntm_set_pool( 0, array, rntm ); @@ -72,14 +70,14 @@ void bls_l3_thread_decorator #ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); #endif { // NOTE: We don't need to create another copy of the rntm_t since // it was already copied in one of the high-level oapi functions. - rntm_t* restrict rntm_p = rntm; + rntm_t* rntm_p = rntm; // There is only one thread id (for the thief thread). const dim_t tid = 0; @@ -137,5 +135,3 @@ void bls_l3_thread_decorator bli_sba_checkin_array( array ); } -#endif - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.h b/sandbox/gemmlike/thread/bls_l3_decor_single.h index 211a43a89..82dfbc993 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.h +++ b/sandbox/gemmlike/thread/bls_l3_decor_single.h @@ -35,10 +35,20 @@ #ifndef BLIS_SBX_L3_DECOR_SINGLE_H #define BLIS_SBX_L3_DECOR_SINGLE_H -// Definitions specific to situations when multithreading is disabled. 
-#ifndef BLIS_ENABLE_MULTITHREADING - -#endif +void bls_l3_thread_decorator_single + ( + l3sbxint_ft func, + opid_t family, + //pack_t schema_a, + //pack_t schema_b, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm + ); #endif diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 3bfde8788..a355385a3 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -775,13 +775,34 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) else int_type_size = sizeof(gint_t) * 8; - char impl_str[16]; + char impl_str[32]; + char def_impl_set_str[32]; + char def_impl_unset_str[32]; char jrir_str[16]; - // Describe the threading implementation. - if ( bli_info_get_enable_openmp() ) sprintf( impl_str, "openmp" ); - else if ( bli_info_get_enable_pthreads() ) sprintf( impl_str, "pthreads" ); - else /* threading disabled */ sprintf( impl_str, "disabled" ); + const bool has_openmp = bli_info_get_enable_openmp(); + const bool has_pthreads = bli_info_get_enable_pthreads(); + const bool openmp_is_def = bli_info_get_enable_openmp_as_default(); + const bool pthreads_is_def = bli_info_get_enable_pthreads_as_default(); + const timpl_t ti = bli_thread_get_thread_impl(); + + // List the available threading implementation(s). + if ( has_openmp && has_pthreads ) sprintf( impl_str, "openmp,pthreads,single" ); + else if ( has_openmp ) sprintf( impl_str, "openmp,single" ); + else if ( has_pthreads ) sprintf( impl_str, "pthreads,single" ); + else sprintf( impl_str, "single only" ); + + // Describe the default threading implementation that would be active if + // or when BLIS_THREAD_IMPL is unset. + if ( openmp_is_def ) sprintf( def_impl_unset_str, "openmp" ); + else if ( pthreads_is_def ) sprintf( def_impl_unset_str, "pthreads" ); + else sprintf( def_impl_unset_str, "single" ); + + // Describe the default threading implementation as the testsuite was + // currently run. 
+ if ( ti == BLIS_OPENMP ) sprintf( def_impl_set_str, "openmp" ); + else if ( ti == BLIS_POSIX ) sprintf( def_impl_set_str, "pthreads" ); + else sprintf( def_impl_set_str, "single" ); // Describe the status of jrir thread partitioning. if ( bli_info_get_thread_part_jrir_slab() ) sprintf( jrir_str, "slab" ); @@ -878,7 +899,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS parallelization info ---\n" ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "multithreading %s\n", impl_str ); + libblis_test_fprintf_c( os, "multithreading modes %s\n", impl_str ); + libblis_test_fprintf_c( os, " default mode %s\n", def_impl_unset_str ); + libblis_test_fprintf_c( os, " current mode %s\n", def_impl_set_str ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "thread auto-factorization \n" ); libblis_test_fprintf_c( os, " m dim thread ratio %d\n", ( int )BLIS_THREAD_RATIO_M ); From 036a4f9d822df25a76a653e70be76fb02284d3d3 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 22 Sep 2022 18:36:50 -0500 Subject: [PATCH 087/230] Refactored some rntm_t management code. (#666) Details: - Separated the "sanitizing" code from the auto-factorization code in bli_rntm_set_ways_from_rntm() and _rntm_set_ways_from_rntm_sup(). The sanitizing code now resides in bli_rntm_sanitize() while the factorization code resides in bli_rntm_factorize() and bli_rntm_factorize_sup(). (There are two different functions because the conventional and sup factorization codes are currently somewhat different.) Also note that the factorization code now relies on the .auto_factor field to have already been set, either during rntm_t initialization or when the rntm_t was previously updated and sanitized. So rather than locally determining whether to auto-factorize, those functions just read the .auto_factor field and proceed accordingly. 
- Refactored and removed most code from bli_thread_init_rntm_from_env(). This function now reads the environment variables needed to set nt, jc, pc, ic, jr, and ir; sets them into the global rntm_t; and then calls bli_rntm_sanitize() in order to make sure that the contents are in a "good" state. Thanks to Devin Matthews for suggesting this refactoring. - Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() such that if multithreading is disabled at compile time (that is, if the cpp macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the caller's request and instead clear the nt and ways fields. - Redefined bli_thread_set_num_threads() and bli_thread_set_ways() such that if multithreading is disabled at compile time (that is, if the cpp macro BLIS_ENABLE_MULTITHREADING is undefined), they ignore the caller's request and do nothing. - Redefined bli_rntm_set_num_threads() and bli_rntm_set_ways() as true functions rather than static inline functions. - In bli_rntm.c, statically initialize the global_rntm global variable via the BLIS_RNTM_INITIALIZER macro. - In bli_rntm.h, defined bli_rntm_clear_auto_factor(), which sets the .auto_factor field of the rntm_t to FALSE. - Reorganized order of some inline function definitions in bli_rntm.h. - Changed the default value given to the .auto_factor field by the BLIS_RNTM_INITIALIZER macro from TRUE to FALSE. - Call bli_rntm_clear_auto_factor() instead of bli_rntm_set_auto_factor_only() in bli_rntm_init(). - Comment/whitespace updates. 
--- frame/3/bli_l3_sup_ref.c | 4 +- frame/base/bli_rntm.c | 453 ++++++++++++++++++++------------------ frame/base/bli_rntm.h | 90 ++++---- frame/thread/bli_thread.c | 178 ++++++--------- frame/thread/bli_thread.h | 1 + 5 files changed, 364 insertions(+), 362 deletions(-) diff --git a/frame/3/bli_l3_sup_ref.c b/frame/3/bli_l3_sup_ref.c index 8eb7a6d4b..76314aba7 100644 --- a/frame/3/bli_l3_sup_ref.c +++ b/frame/3/bli_l3_sup_ref.c @@ -89,7 +89,7 @@ err_t bli_gemmsup_ref // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop. - bli_rntm_set_ways_from_rntm_sup + bli_rntm_factorize_sup ( bli_obj_length( c ), bli_obj_width( c ), @@ -163,7 +163,7 @@ err_t bli_gemmtsup_ref // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop. - bli_rntm_set_ways_from_rntm_sup + bli_rntm_factorize_sup ( bli_obj_length( c ), bli_obj_width( c ), diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 895976679..786998f23 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -36,12 +36,12 @@ // The global rntm_t structure, which holds the global thread settings // along with a few other key parameters. -rntm_t global_rntm; +rntm_t global_rntm = BLIS_RNTM_INITIALIZER; // A mutex to allow synchronous access to global_rntm. bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; -// ---------------------------------------------------------------------------- +// ----------------------------------------------------------------------------- void bli_rntm_init_from_global( rntm_t* rntm ) { @@ -59,6 +59,76 @@ void bli_rntm_init_from_global( rntm_t* rntm ) // ----------------------------------------------------------------------------- +void bli_rntm_set_num_threads + ( + dim_t nt, + rntm_t* rntm + ) +{ +#ifdef BLIS_ENABLE_MULTITHREADING + + // Record the total number of threads to use. 
+ bli_rntm_set_num_threads_only( nt, rntm ); + + // Set the individual ways of parallelism to default states. This + // must be done before sanitization so that the .num_threads field + // will prevail over any previous ways that may have been set. + bli_rntm_clear_ways_only( rntm ); + + // Ensure that the rntm_t is in a consistent state. + bli_rntm_sanitize( rntm ); + +#else + + // When multithreading is disabled at compile time, ignore the user's + // request. And just to be safe, reassert the default rntm_t values. + bli_rntm_clear_num_threads_only( rntm ); + bli_rntm_clear_ways_only( rntm ); + +#endif +} + +void bli_rntm_set_ways + ( + dim_t jc, + dim_t pc, + dim_t ic, + dim_t jr, + dim_t ir, + rntm_t* rntm + ) +{ +#ifdef BLIS_ENABLE_MULTITHREADING + + // Record the number of ways of parallelism per loop. + bli_rntm_set_jc_ways_only( jc, rntm ); + bli_rntm_set_pc_ways_only( 1, rntm ); // Disable pc_nt values. + bli_rntm_set_ic_ways_only( ic, rntm ); + bli_rntm_set_jr_ways_only( jr, rntm ); + bli_rntm_set_ir_ways_only( ir, rntm ); + bli_rntm_set_pr_ways_only( 1, rntm ); + + // Set the total number of threads to its default state. This isn't + // strictly necessary, but is done in case the priority of nt vs. + // ways ever changes. (Currently, the ways always prevail over the + // number of threads, if both are set.) + bli_rntm_clear_num_threads_only( rntm ); + + // Ensure that the rntm_t is in a consistent state. + bli_rntm_sanitize( rntm ); + +#else + + // When multithreading is disabled at compile time, ignore the user's + // request. And just to be safe, reassert the default rntm_t values. 
+ bli_rntm_clear_num_threads_only( rntm ); + bli_rntm_clear_ways_only( rntm ); + +#endif +} + +// ----------------------------------------------------------------------------- + void bli_rntm_set_ways_for_op ( opid_t l3_op, @@ -71,7 +141,7 @@ void bli_rntm_set_ways_for_op { // Set the number of ways for each loop, if needed, depending on what // kind of information is already stored in the rntm_t object. - bli_rntm_set_ways_from_rntm( m, n, k, rntm ); + bli_rntm_factorize( m, n, k, rntm ); #if 0 printf( "bli_rntm_set_ways_for_op()\n" ); @@ -153,146 +223,112 @@ bli_rntm_print( rntm ); } } -void bli_rntm_set_ways_from_rntm +void bli_rntm_sanitize ( - dim_t m, - dim_t n, - dim_t k, rntm_t* rntm ) { - // NOTE: While much of the multithreading cpp case of this function may seem - // redundant with bli_thread_init_rntm_from_env(), we need them both. The - // bli_thread_init_rntm_from_env() function is only called to initialize the - // global rntm_t. There, the consistency logic serves to make sure that sane - // values will be returned if the application (in the time between library - // initialization and when computation begins) subsequently queries the - // number of threads or ways via the runtime API. This function also needs - // the same consistency logic, but for a different reason: this function - // guarantees that the rntm_t has sane values in the event that the - // application passed in a custom rntm_t via an expert interface. 
- - - bool auto_factor = FALSE; - dim_t nt; - dim_t jc, pc, ic, jr, ir; - #ifdef BLIS_ENABLE_MULTITHREADING - nt = bli_rntm_num_threads( rntm ); - jc = bli_rntm_jc_ways( rntm ); - pc = bli_rntm_pc_ways( rntm ); - ic = bli_rntm_ic_ways( rntm ); - jr = bli_rntm_jr_ways( rntm ); - ir = bli_rntm_ir_ways( rntm ); + timpl_t ti = bli_rntm_thread_impl( rntm ); + dim_t nt = bli_rntm_num_threads( rntm ); + dim_t jc = bli_rntm_jc_ways( rntm ); + dim_t pc = bli_rntm_pc_ways( rntm ); + dim_t ic = bli_rntm_ic_ways( rntm ); + dim_t jr = bli_rntm_jr_ways( rntm ); + dim_t ir = bli_rntm_ir_ways( rntm ); + + bool auto_factor = FALSE; bool nt_set = FALSE; bool ways_set = FALSE; - // Some users are mischievous/dumb. Make sure they don't cause trouble. - if ( nt < 1 ) nt = 1; - if ( jc < 1 ) jc = 1; - if ( pc < 1 ) pc = 1; - if ( ic < 1 ) ic = 1; - if ( jr < 1 ) jr = 1; - if ( ir < 1 ) ir = 1; - - // First, we establish whether or not the number of threads or ways of - // parallelism were set to meaningful values. - if ( nt > 1 ) { nt_set = TRUE; } - if ( jc > 1 ) { ways_set = TRUE; } - if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values. - if ( ic > 1 ) { ways_set = TRUE; } - if ( jr > 1 ) { ways_set = TRUE; } - if ( ir > 1 ) { ways_set = TRUE; } - - // Now we use the values of nt_set and ways_set to determine how to - // interpret the original values we found in the rntm_t object. - - if ( ways_set == TRUE ) + if ( ti == BLIS_SINGLE ) { - // If the per-loop ways of parallelism were set, then we use the values - // that were given and interpreted above. The only thing left to do is - // calculate the correct number of threads. Notice that if the user also - // happened to set the total number of threads that value is discarded - // in favor of the implied value from the per-loop ways of parallelism. + // If the threading implementation was set to BLIS_SINGLE, we ignore + // everything else. 
- nt = jc * pc * ic * jr * ir; + nt = 1; + jc = pc = ic = jr = ir = 1; auto_factor = FALSE; } - else if ( ways_set == FALSE && nt_set == TRUE ) + else // if ( ti != BLIS_SINGLE ) { - // If the ways were not set but the number of thread was set, then we - // will attempt to automatically generate a thread factorization that - // will work given the problem size. - - #ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS - // If use of prime numbers is disallowed for automatic thread - // factorizations, we first check if the number of threads requested - // is prime. If it is prime, and it exceeds a minimum threshold, then - // we reduce the number of threads by one so that the number is not - // prime. This will allow for automatic thread factorizations to span - // two dimensions (loops), which tends to be more efficient. - if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1; - #endif - - //printf( "m n = %d %d BLIS_THREAD_RATIO_M _N = %d %d\n", - // (int)m, (int)n, (int)BLIS_THREAD_RATIO_M, - // (int)BLIS_THREAD_RATIO_N ); - - bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M, - n*BLIS_THREAD_RATIO_N, &ic, &jc ); - - //printf( "jc ic = %d %d\n", (int)jc, (int)ic ); - - for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- ) + // If the threading implementation was set to one of the true + // multithreading implementations (e.g. BLIS_OPENMP, BLIS_POSIX), + // we proceed to interpret and process the rntm_t's fields. + + // Some users are mischievous/dumb. Make sure they don't cause trouble. + if ( nt < 1 ) nt = 1; + if ( jc < 1 ) jc = 1; + if ( pc < 1 ) pc = 1; + if ( ic < 1 ) ic = 1; + if ( jr < 1 ) jr = 1; + if ( ir < 1 ) ir = 1; + + // Now establish whether or not the number of threads or ways of + // parallelism were set to meaningful values. + if ( nt > 1 ) { nt_set = TRUE; } + if ( jc > 1 ) { ways_set = TRUE; } + if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values. 
+ if ( ic > 1 ) { ways_set = TRUE; } + if ( jr > 1 ) { ways_set = TRUE; } + if ( ir > 1 ) { ways_set = TRUE; } + + // Next, we use the values of nt_set and ways_set to determine how to + // interpret the original values we found in the rntm_t object. + + if ( ways_set == TRUE ) { - if ( ic % ir == 0 ) { ic /= ir; break; } + // If the per-loop ways of parallelism were set, then we use the values + // that were given and interpreted above. Since the per-loop ways are + // known, we can calculate the total number of threads. Notice that if + // the user also happened to set the total number of threads, that value + // is discarded in favor of the implied value from the per-loop ways of + // parallelism. + + nt = jc * pc * ic * jr * ir; + auto_factor = FALSE; } + else if ( ways_set == FALSE && nt_set == TRUE ) + { + // If the ways were not set but the number of thread was set, then we + // will attempt to automatically generate a thread factorization that + // will work given the problem size. This happens later, in + // bli_rntm_factorize(). - for ( jr = BLIS_THREAD_MAX_JR ; jr > 1 ; jr-- ) + auto_factor = TRUE; + } + else // if ( ways_set == FALSE && nt_set == FALSE ) { - if ( jc % jr == 0 ) { jc /= jr; break; } + // If neither the ways nor the number of threads were set, then the + // rntm_t was not meaningfully changed since initialization. This means + // the ways are already 1, which will lead to the default behavior of + // single-threaded execution. } - - // Force the number of ways of parallelism in the pc loop to 1 - // just in case the caller set it to something greater than 1. - pc = 1; - - // Make note that auto-factorization was performed. - auto_factor = TRUE; - } - else // if ( ways_set == FALSE && nt_set == FALSE ) - { - // If neither the ways nor the number of threads were set, then the - // rntm_t was not meaningfully changed since initialization. 
This means - // the fields are all 1, which will lead to the default behavior of - // single-threaded execution. - //nt = jc = pc = ic = jr = ir = 1; - //auto_factor = FALSE; } + // Save the results back in the rntm_t object. + // Note: We don't need to set the .thread_impl field of the rntm_t because + // it was not changed in the sanitization process. + //bli_rntm_set_thread_impl_only( ti, rntm ); + bli_rntm_set_num_threads_only( nt, rntm ); + bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); + bli_rntm_set_auto_factor_only( auto_factor, rntm ); + #else // When multithreading is disabled, always set the per-loop ways of // parallelism to 1. - nt = 1; - jc = pc = ic = jr = ir = 1; + bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm ); + bli_rntm_set_num_threads_only( 1, rntm ); + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); + bli_rntm_set_auto_factor_only( FALSE, rntm ); #endif - - // Save the results back in the rntm_t object. - bli_rntm_set_auto_factor_only( auto_factor, rntm ); - bli_rntm_set_num_threads_only( nt, rntm ); - bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); - - // NOTE: The caller should have already set the timpl_t field of the rntm_t, - // either in the course of it being initialized via BLIS_RNTM_INITIALIZER - // or bli_rntm_init(), or by the user (subsequently) setting the value - // directly via bli_rntm_set_thread_impl(). } -void bli_rntm_set_ways_from_rntm_sup +void bli_rntm_factorize ( dim_t m, dim_t n, @@ -300,122 +336,115 @@ void bli_rntm_set_ways_from_rntm_sup rntm_t* rntm ) { - bool auto_factor = FALSE; - dim_t nt; - dim_t jc, pc, ic, jr, ir; - #ifdef BLIS_ENABLE_MULTITHREADING - nt = bli_rntm_num_threads( rntm ); - jc = bli_rntm_jc_ways( rntm ); - pc = bli_rntm_pc_ways( rntm ); - ic = bli_rntm_ic_ways( rntm ); - jr = bli_rntm_jr_ways( rntm ); - ir = bli_rntm_ir_ways( rntm ); + // The .auto_factor field would have been set either at initialization or + // when the rntm_t was sanitized after being updated by the user. 
+ if ( bli_rntm_auto_factor( rntm ) ) + { + dim_t nt = bli_rntm_num_threads( rntm ); + dim_t jc = bli_rntm_jc_ways( rntm ); + dim_t pc = bli_rntm_pc_ways( rntm ); + dim_t ic = bli_rntm_ic_ways( rntm ); + dim_t jr = bli_rntm_jr_ways( rntm ); + dim_t ir = bli_rntm_ir_ways( rntm ); - bool nt_set = FALSE; - bool ways_set = FALSE; + if ( 0 < m && 0 < n && 0 <= k ) + { + #ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS + // If use of prime numbers is disallowed for automatic thread + // factorizations, we first check if the number of threads requested + // is prime. If it is prime, and it exceeds a minimum threshold, then + // we reduce the number of threads by one so that the number is not + // prime. This will allow for automatic thread factorizations to span + // two dimensions (loops), which tends to be more efficient. + if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1; + #endif + + //printf( "m n = %d %d BLIS_THREAD_RATIO_M _N = %d %d\n", + // (int)m, (int)n, (int)BLIS_THREAD_RATIO_M, + // (int)BLIS_THREAD_RATIO_N ); + + bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M, + n*BLIS_THREAD_RATIO_N, &ic, &jc ); + + //printf( "jc ic = %d %d\n", (int)jc, (int)ic ); + + for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- ) + { + if ( ic % ir == 0 ) { ic /= ir; break; } + } - // Some users are mischievous/dumb. Make sure they don't cause trouble. - if ( nt < 1 ) nt = 1; - if ( jc < 1 ) jc = 1; - if ( pc < 1 ) pc = 1; - if ( ic < 1 ) ic = 1; - if ( jr < 1 ) jr = 1; - if ( ir < 1 ) ir = 1; - - // First, we establish whether or not the number of threads or ways of - // parallelism were set to meaningful values. - if ( nt > 1 ) { nt_set = TRUE; } - if ( jc > 1 ) { ways_set = TRUE; } - if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values. 
- if ( ic > 1 ) { ways_set = TRUE; } - if ( jr > 1 ) { ways_set = TRUE; } - if ( ir > 1 ) { ways_set = TRUE; } - - // Now we use the values of nt_set and ways_set to determine how to - // interpret the original values we found in the rntm_t object. - - if ( ways_set == TRUE ) - { - // If the per-loop ways of parallelism were set, then we use the values - // that were given and interpreted above. The only thing left to do is - // calculate the correct number of threads. Notice that if the user also - // happened to set the total number of threads that value is discarded - // in favor of the implied value from the per-loop ways of parallelism. + for ( jr = BLIS_THREAD_MAX_JR ; jr > 1 ; jr-- ) + { + if ( jc % jr == 0 ) { jc /= jr; break; } + } + } - nt = jc * pc * ic * jr * ir; - auto_factor = FALSE; + // Save the results back in the rntm_t object. + bli_rntm_set_num_threads_only( nt, rntm ); + bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); } - else if ( ways_set == FALSE && nt_set == TRUE ) + +#else + + // When multithreading is disabled at compile time, the rntm can keep its + // default initialization values since using one thread requires no + // factorization. + +#endif +} + +void bli_rntm_factorize_sup + ( + dim_t m, + dim_t n, + dim_t k, + rntm_t* rntm + ) +{ +#ifdef BLIS_ENABLE_MULTITHREADING + + // The .auto_factor field would have been set either at initialization or + // when the rntm_t was sanitized after being updated by the user. + if ( bli_rntm_auto_factor( rntm ) ) { - // If the ways were not set but the number of thread was set, then we - // will attempt to automatically generate a thread factorization that - // work given the problem size. - - #ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS - // If use of prime numbers is disallowed for automatic thread - // factorizations, we first check if the number of threads requested - // is prime. 
If it is prime, and it exceeds a minimum threshold, then - // we reduce the number of threads by one so that the number is not - // prime. This will allow for automatic thread factorizations to span - // two dimensions (loops), which tends to be more efficient. - if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1; - #endif - - //bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M, - // n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc ); - bli_thread_partition_2x2( nt, m, - n, &ic, &jc ); - - //printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d ic = %d\n", - // (int)jc, (int)ic ); - - #if 0 - for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- ) - { - if ( ic % ir == 0 ) { ic /= ir; break; } - } + dim_t nt = bli_rntm_num_threads( rntm ); + dim_t jc = bli_rntm_jc_ways( rntm ); + dim_t pc = bli_rntm_pc_ways( rntm ); + dim_t ic = bli_rntm_ic_ways( rntm ); + dim_t jr = bli_rntm_jr_ways( rntm ); + dim_t ir = bli_rntm_ir_ways( rntm ); - for ( jr = BLIS_THREAD_SUP_MAX_JR ; jr > 1 ; jr-- ) + if ( 0 < m && 0 < n && 0 <= k ) { - if ( jc % jr == 0 ) { jc /= jr; break; } + #ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS + // If use of prime numbers is disallowed for automatic thread + // factorizations, we first check if the number of threads requested + // is prime. If it is prime, and it exceeds a minimum threshold, then + // we reduce the number of threads by one so that the number is not + // prime. This will allow for automatic thread factorizations to span + // two dimensions (loops), which tends to be more efficient. + if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1; + #endif + + bli_thread_partition_2x2( nt, m, + n, &ic, &jc ); + ir = 1; jr = 1; } - #else - ir = 1; - jr = 1; - #endif - // Force the number of ways of parallelism in the pc loop to 1 just in - // case the caller set it to something greater than 1. - pc = 1; - - // Make note that auto-factorization was performed. 
- auto_factor = TRUE; - } - else // if ( ways_set == FALSE && nt_set == FALSE ) - { - // If neither the ways nor the number of threads were set, then the - // rntm_t was not meaningfully changed since initialization. This means - // the fields are all 1, which will lead to the default behavior of - // single-threaded execution. - //nt = jc = pc = ic = jr = ir = 1; - //auto_factor = FALSE; + // Save the results back in the rntm_t object. + bli_rntm_set_num_threads_only( nt, rntm ); + bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); } #else - // When multithreading is disabled, always set the per-loop ways of - // parallelism to 1. - nt = 1; - jc = pc = ic = jr = ir = 1; + // When multithreading is disabled at compile time, the rntm can keep its + // default initialization values since using one thread requires no + // factorization. #endif - - // Save the results back in the rntm_t object. - bli_rntm_set_auto_factor_only( auto_factor, rntm ); - bli_rntm_set_num_threads_only( nt, rntm ); - bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); } void bli_rntm_print diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index 426b74d60..f6756c589 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -45,10 +45,10 @@ typedef struct rntm_s { timpl_t thread_impl; - bool auto_factor; - dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; + + bool auto_factor; bool pack_a; bool pack_b; bool l3_sup; @@ -214,14 +214,6 @@ BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); } -BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) -{ - bli_rntm_set_sba_pool( NULL, rntm ); -} -BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) -{ - bli_rntm_set_pba( NULL, rntm ); -} // // -- rntm_t modification (public API) ----------------------------------------- @@ -233,31 +225,6 @@ BLIS_INLINE void bli_rntm_set_thread_impl( timpl_t thread_impl, rntm_t* rntm ) bli_rntm_set_thread_impl_only( thread_impl, rntm ); } 
-BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) -{ - // Record the total number of threads to use. - bli_rntm_set_num_threads_only( nt, rntm ); - - // Set the individual ways of parallelism to default states. - bli_rntm_clear_ways_only( rntm ); -} - -BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) -{ - // Record the number of ways of parallelism per loop. - bli_rntm_set_jc_ways_only( jc, rntm ); - bli_rntm_set_pc_ways_only( 1, rntm ); - bli_rntm_set_ic_ways_only( ic, rntm ); - bli_rntm_set_jr_ways_only( jr, rntm ); - bli_rntm_set_ir_ways_only( ir, rntm ); - bli_rntm_set_pr_ways_only( 1, rntm ); - - // Set the num_threads field to the product of all the ways. The only - // benefit of doing this, though, is that the user can query the total - // number of threads from the rntm_t after calling this function. - bli_rntm_set_num_threads_only( jc * 1 * ic * jr * ir, rntm ); -} - BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. 
@@ -287,6 +254,15 @@ BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) // -- rntm_t modification (internal use only) ---------------------------------- // +BLIS_INLINE void bli_rntm_clear_thread_impl( rntm_t* rntm ) +{ + bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm ); +} + +BLIS_INLINE void bli_rntm_clear_auto_factor( rntm_t* rntm ) +{ + bli_rntm_set_auto_factor_only( FALSE, rntm ); +} BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); @@ -300,6 +276,15 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) bli_rntm_set_l3_sup( TRUE, rntm ); } +BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) +{ + bli_rntm_set_sba_pool( NULL, rntm ); +} +BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) +{ + bli_rntm_set_pba( NULL, rntm ); +} + // // -- rntm_t initialization ---------------------------------------------------- // @@ -310,10 +295,10 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) #define BLIS_RNTM_INITIALIZER \ { \ - .thread_impl = SINGLE, \ - .auto_factor = TRUE, \ + .thread_impl = BLIS_SINGLE, \ .num_threads = 1, \ .thrloop = { 1, 1, 1, 1, 1, 1 }, \ + .auto_factor = FALSE, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ @@ -323,12 +308,12 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { - bli_rntm_set_thread_impl_only( BLIS_SINGLE, rntm ); - - bli_rntm_set_auto_factor_only( TRUE, rntm ); + bli_rntm_clear_thread_impl( rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); + + bli_rntm_clear_auto_factor( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); @@ -363,6 +348,22 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); +BLIS_EXPORT_BLIS void bli_rntm_set_num_threads + ( + dim_t nt, + rntm_t* rntm + ); + +BLIS_EXPORT_BLIS void bli_rntm_set_ways + ( + dim_t jc, + dim_t pc, + dim_t ic, + 
dim_t jr, + dim_t ir, + rntm_t* rntm + ); + BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, @@ -373,7 +374,12 @@ BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op rntm_t* rntm ); -void bli_rntm_set_ways_from_rntm +void bli_rntm_sanitize + ( + rntm_t* rntm + ); + +void bli_rntm_factorize ( dim_t m, dim_t n, @@ -381,7 +387,7 @@ void bli_rntm_set_ways_from_rntm rntm_t* rntm ); -void bli_rntm_set_ways_from_rntm_sup +void bli_rntm_factorize_sup ( dim_t m, dim_t n, diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 9369b373b..eefc20fdd 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -1562,6 +1562,18 @@ timpl_t bli_thread_get_thread_impl( void ) return bli_rntm_thread_impl( &global_rntm ); } +static const char* bli_timpl_string[BLIS_NUM_THREAD_IMPLS] = +{ + [BLIS_SINGLE] = "single", + [BLIS_OPENMP] = "openmp", + [BLIS_POSIX] = "pthreads", +}; + +const char* bli_thread_get_thread_impl_str( timpl_t ti ) +{ + return bli_timpl_string[ti]; +} + // ---------------------------------------------------------------------------- void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ) @@ -1569,13 +1581,25 @@ void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ) // We must ensure that global_rntm has been initialized. bli_init_once(); +#ifdef BLIS_ENABLE_MULTITHREADING + // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); bli_rntm_set_ways_only( jc, 1, ic, jr, ir, &global_rntm ); + // Ensure that the rntm_t is in a consistent state. + bli_rntm_sanitize( &global_rntm ); + // Release the mutex protecting global_rntm. bli_pthread_mutex_unlock( &global_rntm_mutex ); + +#else + + // When multithreading is disabled at compile time, ignore the user's + // request. 
+ +#endif } void bli_thread_set_num_threads( dim_t n_threads ) @@ -1583,13 +1607,25 @@ void bli_thread_set_num_threads( dim_t n_threads ) // We must ensure that global_rntm has been initialized. bli_init_once(); +#ifdef BLIS_ENABLE_MULTITHREADING + // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); bli_rntm_set_num_threads_only( n_threads, &global_rntm ); + // Ensure that the rntm_t is in a consistent state. + bli_rntm_sanitize( &global_rntm ); + // Release the mutex protecting global_rntm. bli_pthread_mutex_unlock( &global_rntm_mutex ); + +#else + + // When multithreading is disabled at compile time, ignore the user's + // request. + +#endif } void bli_thread_set_thread_impl( timpl_t ti ) @@ -1619,24 +1655,10 @@ void bli_thread_init_rntm_from_env // function is only called from bli_thread_init(), which is only called // by bli_init_once(). - // NOTE: While much of the multithreading cpp case of this function may seem - // redundant with bli_rntm_set_ways_from_rntm(), we need them both. This - // function is only called to initialize the global rntm_t. Here, the - // consistency logic serves to make sure that sane values will be returned - // if the application (in the time between library initialization and when - // computation begins) subsequently queries the number of threads or ways - // via the runtime API. The bli_rntm_set_ways_from_rntm() function also - // needs the same consistency logic, but for a different reason: that - // function guarantees that the rntm_t has sane values in the event that the - // application passed in a custom rntm_t via an expert interface. - - bool auto_factor = FALSE; - dim_t nt; - dim_t jc, pc, ic, jr, ir; - timpl_t ti; - #ifdef BLIS_ENABLE_MULTITHREADING + timpl_t ti = BLIS_SINGLE; + // Try to read BLIS_THREAD_IMPL. 
char* ti_env = bli_env_get_str( "BLIS_THREAD_IMPL" ); @@ -1657,18 +1679,16 @@ void bli_thread_init_rntm_from_env else ti = BLIS_SINGLE; #ifdef PRINT_IMPL - if ( ti == BLIS_OPENMP ) - printf( "detected BLIS_THREAD_IMPL=openmp.\n" ); - else if ( ti == BLIS_POSIX ) - printf( "detected BLIS_THREAD_IMPL=pthreads.\n" ); - else - printf( "detected BLIS_THREAD_IMPL=single.\n" ); + printf( "detected BLIS_THREAD_IMPL=%s.\n", + bli_thread_get_thread_impl_str( ti ); #endif } else { // If BLIS_THREAD_IMPL was unset, default to the implementation that // was determined at configure-time. + ti = BLIS_SINGLE; + #ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT ti = BLIS_OPENMP; #endif @@ -1677,16 +1697,15 @@ void bli_thread_init_rntm_from_env #endif #ifdef PRINT_IMPL - printf( "BLIS_THREAD_IMPL unset.\n" ); - if ( ti == BLIS_OPENMP ) - printf( "defaulting to BLIS_THREAD_IMPL=openmp.\n" ); - else if ( ti == BLIS_POSIX ) - printf( "defaulting to BLIS_THREAD_IMPL=pthreads.\n" ); + printf( "BLIS_THREAD_IMPL unset; defaulting to BLIS_THREAD_IMPL=%s.\n", + bli_thread_get_thread_impl_str( ti ); #endif } + // ------------------------------------------------------------------------ + // Try to read BLIS_NUM_THREADS first. - nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 ); + dim_t nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 ); // If BLIS_NUM_THREADS was not set, try to read BLIS_NT. if ( nt == -1 ) nt = bli_env_get_var( "BLIS_NT", -1 ); @@ -1694,92 +1713,39 @@ void bli_thread_init_rntm_from_env // If neither BLIS_NUM_THREADS nor BLIS_NT were set, try OMP_NUM_THREADS. if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); + // ------------------------------------------------------------------------ + // Read the environment variables for the number of threads (ways of // parallelism) for each individual loop. 
- jc = bli_env_get_var( "BLIS_JC_NT", -1 ); - pc = bli_env_get_var( "BLIS_PC_NT", -1 ); - ic = bli_env_get_var( "BLIS_IC_NT", -1 ); - jr = bli_env_get_var( "BLIS_JR_NT", -1 ); - ir = bli_env_get_var( "BLIS_IR_NT", -1 ); - - bool nt_set = FALSE; - bool ways_set = FALSE; - - // Some users are mischievous/dumb. Make sure they don't cause trouble. - if ( nt < 1 ) nt = 1; - if ( jc < 1 ) jc = 1; - if ( pc < 1 ) pc = 1; - if ( ic < 1 ) ic = 1; - if ( jr < 1 ) jr = 1; - if ( ir < 1 ) ir = 1; - - // First, we establish whether or not the number of threads or ways of - // parallelism were set to meaningful values. - if ( nt > 1 ) { nt_set = TRUE; } - if ( jc > 1 ) { ways_set = TRUE; } - if ( pc > 1 ) { ways_set = TRUE; pc = 1; } // Disable pc_nt values. - if ( ic > 1 ) { ways_set = TRUE; } - if ( jr > 1 ) { ways_set = TRUE; } - if ( ir > 1 ) { ways_set = TRUE; } - - // Now we use the values of nt_set and ways_set to determine how to - // interpret the original values we found in the rntm_t object. - - if ( ( ways_set == TRUE && nt_set == TRUE ) || - ( ways_set == TRUE && nt_set == FALSE ) ) - { - // If the per-loop ways of parallelism were set, then we use the values - // that were given and interpreted above. The only thing left to do is - // calculate the correct number of threads. Notice that whatever value - // may have been asigned to BLIS_NUM_THREADS will be ignored, and the - // total number of threads will be taken to be the number implied from - // the per-loop ways of parallelism. - - nt = jc * pc * ic * jr * ir; - auto_factor = FALSE; - } - else if ( ways_set == FALSE && nt_set == TRUE ) - { - // If the ways were not set but the number of thread was set, then we - // will attempt to automatically generate a thread factorization that - // will work given the problem size. This auto-factorization will - // occur later, in bli_rntm_set_ways_from_rntm(), once we know the - // problem size. - - // Make note that auto-factorization will be performed. 
- auto_factor = TRUE; - } - else // if ( ways_set == FALSE && nt_set == FALSE ) - { - // If neither the ways nor the number of threads were set, then we - // allow the default values to stand. - //nt = jc = pc = ic = jr = ir = 1; - //auto_factor = FALSE; - } - -#else + dim_t jc = bli_env_get_var( "BLIS_JC_NT", -1 ); + dim_t pc = bli_env_get_var( "BLIS_PC_NT", -1 ); + dim_t ic = bli_env_get_var( "BLIS_IC_NT", -1 ); + dim_t jr = bli_env_get_var( "BLIS_JR_NT", -1 ); + dim_t ir = bli_env_get_var( "BLIS_IR_NT", -1 ); - // Note that we don't even bother checking BLIS_THREAD_IMPL if neither - // OpenMP nor pthreads was enabled at compile time. - ti = BLIS_SINGLE; - - // When multithreading is disabled, always set the per-loop ways of - // parallelism to 1. - jc = pc = ic = jr = ir = 1; - nt = 1; - auto_factor = FALSE; - -#endif + // ------------------------------------------------------------------------ // Save the results back in the runtime object. bli_rntm_set_thread_impl_only( ti, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); - bli_rntm_set_auto_factor_only( auto_factor, rntm ); - #if 0 - printf( "bli_thread_init_rntm_from_env()\n" ); - bli_rntm_print( rntm ); - #endif + // ------------------------------------------------------------------------ + + // This function, bli_thread_init_rntm_from_env(), is only called when BLIS + // is initialized, and so we need to go one step further and process the + // rntm's contents into a standard form to ensure, for example, that none of + // the ways of parallelism are negative or zero (in case the user queries + // them later). + bli_rntm_sanitize( rntm ); + +#else + + // When multithreading is disabled, the global rntm can keep the values it + // was assigned at (static) initialization time. 
+ +#endif + + //printf( "bli_thread_init_rntm_from_env()\n" ); bli_rntm_print( rntm ); } diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 509072e57..88bdccda5 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -202,6 +202,7 @@ BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS timpl_t bli_thread_get_thread_impl( void ); +BLIS_EXPORT_BLIS const char* bli_thread_get_thread_impl_str( timpl_t ti ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); From ee81efc7887374c974a78bfb3e0865776b2f97a8 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 22 Sep 2022 19:15:07 -0500 Subject: [PATCH 088/230] Parameterized test/3 drivers via command line args. (#667) Details: - Rewrote the drivers in test/3, the Makefile, and the runme.sh script so that most of the important parameters, including parameter combo, datatype, storage combo, induced method, problem size range, dimension bindings, number of repeats, and alpha/beta values can be passed in via command line arguments. (Previously, most of these parameters were hard-coded into the driver source, except a few that were hard-coded into the Makefile.) If no argument is given for any particular option, it will be assigned a sane default. Either way, the values employed at runtime will be printed to stdout before the performance data in a section that is commented out with '%' characters (which is used by matlab and octave for comments), unless the -q option is given, in which case the driver will proceed quietly and output only performance data. Each driver also provides extensive help via the -h option, with the help text tailored for the operation in question (e.g. gemm, hemm, herk, etc.). 
In this help text, the driver reminds the user which implementation it was linked to (e.g. blis, openblas, vendor, eigen). Thanks to Jeff Diamond for suggesting this CLI-based reimagining of the test/3 drivers. - In the test/3 drivers: converted cpp macro string constants, as well as two string literals (for the opname and pc_str) used in each test driver, to global (or static) const char* strings, and replaced the use of strncpy() for storing the results of the command line argument parsing with pointer copies from the corresponding strings in argv. This works because the argv array is guaranteed by the C99 standard to persist throughout the life of the program. This new approach uses less storage and executes faster. Thanks to Minh Quan Ho for recommending this change. - Renamed the IMP_STR cpp macro that gets defined on the command line, via the test/3/Makefile, to IMPL_STR. - Updated runme.sh to set the problem size ranges for single-threaded and multithreaded execution independently from one another, as well as on a per-system basis. - Added a 'quiet' variable to runme.sh that can easily toggle quiet mode for the test drivers' output. - Very minor typecast fix in call to bli_getopt() in bli_utils.c. - In bli_getopt(), changed the nextchar variable from being a local static variable to a field of the getopt_t state struct. (Not sure why it was ever declared static to begin with.) - Other minor changes to bli_getopt() to accommodate the rewritten test drivers' command line parsing needs. 
--- frame/base/bli_getopt.c | 54 ++-- frame/base/bli_getopt.h | 1 + test/3/Makefile | 302 ++++++------------ test/3/old/runme.sh | 277 ++++++++++++++++ test/3/runme.sh | 199 +++++++----- test/3/test_gemm.c | 335 +++++++++++--------- test/3/test_hemm.c | 194 +++++++----- test/3/test_herk.c | 192 ++++++----- test/3/test_trmm.c | 202 ++++++------ test/3/test_trsm.c | 203 ++++++------ test/3/test_utils.c | 684 ++++++++++++++++++++++++++++++++++++++++ test/3/test_utils.h | 142 +++++++++ 12 files changed, 1993 insertions(+), 792 deletions(-) create mode 100755 test/3/old/runme.sh create mode 100644 test/3/test_utils.c create mode 100644 test/3/test_utils.h diff --git a/frame/base/bli_getopt.c b/frame/base/bli_getopt.c index e1d90d323..bf74eb1d7 100644 --- a/frame/base/bli_getopt.c +++ b/frame/base/bli_getopt.c @@ -37,18 +37,19 @@ static const char OPT_MARKER = '-'; +//bool bli_char_is_in_str( char ch, const char* str ); + void bli_getopt_init_state( int opterr, getopt_t* state ) { - state->optarg = NULL; - state->optind = 1; - state->opterr = opterr; - state->optopt = 0; + state->nextchar = NULL; + state->optarg = NULL; + state->optind = 1; + state->opterr = opterr; + state->optopt = 0; } int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state ) { - static const char* nextchar = NULL; - const char* elem_str; const char* optstr_char; @@ -60,7 +61,7 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop // an element of argv with more than one option character, in which // case we need to pick up where we left off (which is the address // contained in nextchar). - if ( nextchar == NULL ) + if ( state->nextchar == NULL ) { elem_str = argv[ state->optind ]; @@ -87,10 +88,10 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop // character. // Use the nextchar pointer as our element string. - elem_str = nextchar; + elem_str = state->nextchar; // Reset nextchar to NULL. 
- nextchar = NULL; + state->nextchar = NULL; } // Find the first occurrence of elem_str[0] in optstring. @@ -130,17 +131,24 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop state->optind += 1; return '?'; } - // If there are still more elements in argv yet to process AND - // the next one is an option, then the argument was omitted. + // If there are still more elements in argv yet to process AND the + // next one is an option marker, then the argument was omitted + // (unless the option marker is actually part of the argument, + // such as with negative numbers, e.g. -1, which is very likely + // if the char *after* the option marker is missing from optstring). else if ( argv[ state->optind + 1 ][0] == OPT_MARKER ) { - if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (next element of argv is option '%c')\n", elem_str[0], argv[ state->optind + 1 ][1] ); - - state->optopt = *optstr_char; - state->optind += 1; - return '?'; + // If the char after the option marker is present in optstring, + // then the first option argument is missing. + if ( strchr( optstring, argv[ state->optind + 1 ][1] ) != NULL ) + { + if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (next element of argv is option '%c')\n", elem_str[0], argv[ state->optind + 1 ][1] ); + + state->optopt = *optstr_char; + state->optind += 1; + return '?'; + } } - // If no error was deteced above, we can safely assign optarg // to be the next element in argv and increment optind by two. 
state->optarg = argv[ state->optind + 1 ]; @@ -166,7 +174,7 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop { if ( strchr( optstring, elem_str[1] ) != NULL ) { - nextchar = &elem_str[1]; + state->nextchar = &elem_str[1]; return *optstr_char; } } @@ -176,3 +184,13 @@ int bli_getopt( int argc, const char* const * argv, const char* optstring, getop return *optstr_char; } +#if 0 +bool bli_char_is_in_str( char ch, const char* str ) +{ + int chi = ( int )ch; + + if ( strchr( str, chi ) == NULL ) return FALSE; + + return TRUE; +} +#endif diff --git a/frame/base/bli_getopt.h b/frame/base/bli_getopt.h index bb0e4f2cf..1e0f7b250 100644 --- a/frame/base/bli_getopt.h +++ b/frame/base/bli_getopt.h @@ -34,6 +34,7 @@ typedef struct getopt_s { + const char* nextchar; const char* optarg; int optind; int opterr; diff --git a/test/3/Makefile b/test/3/Makefile index 568b7ffb0..e7cb7235a 100644 --- a/test/3/Makefile +++ b/test/3/Makefile @@ -126,25 +126,6 @@ VENDOR_LIB := $(MKL_LIB) VENDORP_LIB := $(MKLP_LIB) -# -# --- Problem size definitions ------------------------------------------------- -# - -# Single core (single-threaded) -PS_BEGIN := 48 -PS_MAX := 2400 -PS_INC := 48 - -# Single-socket (multithreaded) -P1_BEGIN := 120 -P1_MAX := 6000 -P1_INC := 120 - -# Dual-socket (multithreaded) -P2_BEGIN := 160 -P2_MAX := 8000 -P2_INC := 160 - # # --- General build definitions ------------------------------------------------ @@ -182,30 +163,19 @@ CXXFLAGS_MT := -march=native $(CXXFLAGS) # Which library? 
-BLI_DEF := -DBLIS -BLA_DEF := -DBLAS -EIG_DEF := -DEIGEN - -# Complex implementation type -D1M := -DIND=BLIS_1M -DNAT := -DIND=BLIS_NAT - -# Implementation string -#STR_1M := -DSTR=\"1m\" -STR_NAT := -DSTR=\"asm_blis\" -STR_OBL := -DSTR=\"openblas\" -STR_EIG := -DSTR=\"eigen\" -STR_VEN := -DSTR=\"vendor\" - -# Single or multithreaded string -STR_ST := -DTHR_STR=\"st\" -STR_1S := -DTHR_STR=\"1s\" -STR_2S := -DTHR_STR=\"2s\" +DEF_BLI := -DBLIS +DEF_BLA := -DBLAS +DEF_EIG := -DEIGEN -# Problem size specification -PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX) -PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX) -PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX) +# Implementation string. +STR_BLI := -DIMPL_STR=\"blis\" +STR_OBL := -DIMPL_STR=\"openblas\" +STR_EIG := -DIMPL_STR=\"eigen\" +STR_VEN := -DIMPL_STR=\"vendor\" + +# Single or multithreaded string. +STR_ST := -DTHR_STR=\"st\" +STR_MT := -DTHR_STR=\"mt\" @@ -213,188 +183,132 @@ PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX) # --- Targets/rules ------------------------------------------------------------ # -all: all-st all-1s all-2s -blis: blis-st blis-1s blis-2s -openblas: openblas-st openblas-1s openblas-2s -eigen: eigen-st eigen-1s eigen-2s -vendor: vendor-st vendor-1s vendor-2s -mkl: vendor -armpl: vendor +all: all-st -all-st: blis-st openblas-st mkl-st eigen-st -all-1s: blis-1s openblas-1s mkl-1s eigen-1s -all-2s: blis-2s openblas-2s mkl-2s eigen-2s +all-st: blis-st openblas-st mkl-st eigen-st +all-mt: blis-mt openblas-mt mkl-mt eigen-mt -blis-st: blis-nat-st -blis-1s: blis-nat-1s -blis-2s: blis-nat-2s - -#blis-ind: blis-ind-st blis-ind-mt -blis-nat: blis-nat-st blis-nat-1s blis-nat-2s +blis: blis-st +openblas: openblas-st +eigen: eigen-st +vendor: vendor-st +mkl: mkl-st # Define the datatypes, operations, and implementations. 
-DTS := s d c z OPS := gemm hemm herk trmm trsm -BIMPLS := asm_blis openblas vendor +BIMPLS := blis openblas vendor EIMPLS := eigen -# Define functions to construct object filenames from the datatypes and -# operations given an implementation. We define one function for single- -# threaded, single-socket, and dual-socket filenames. -get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o)) -get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o)) -get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o)) - -# Construct object and binary names for single-threaded, single-socket, and -# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL). -BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis) -BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS)) -BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis) -BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS)) -BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis) -BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS)) +# Define a function to construct object filenames from the operations +# given an implementation. +get-st-objs = $(foreach op,$(OPS),test_$(op)_$(1)_st.o) +get-mt-objs = $(foreach op,$(OPS),test_$(op)_$(1)_mt.o) + +# Construct object and binary names for single-threaded and multithreaded +# files for BLIS, OpenBLAS, Eigen, and a vendor library (e.g. MKL). 
+BLIS_ST_OBJS := $(call get-st-objs,blis) +BLIS_ST_BINS := $(patsubst %.o,%.x,$(BLIS_ST_OBJS)) + +BLIS_MT_OBJS := $(call get-mt-objs,blis) +BLIS_MT_BINS := $(patsubst %.o,%.x,$(BLIS_MT_OBJS)) OPENBLAS_ST_OBJS := $(call get-st-objs,openblas) OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS)) -OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas) -OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS)) -OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas) -OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS)) + +OPENBLAS_MT_OBJS := $(call get-mt-objs,openblas) +OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS)) EIGEN_ST_OBJS := $(call get-st-objs,eigen) EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS)) -EIGEN_1S_OBJS := $(call get-1s-objs,eigen) -EIGEN_1S_BINS := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS)) -EIGEN_2S_OBJS := $(call get-2s-objs,eigen) -EIGEN_2S_BINS := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS)) + +EIGEN_MT_OBJS := $(call get-mt-objs,eigen) +EIGEN_MT_BINS := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS)) VENDOR_ST_OBJS := $(call get-st-objs,vendor) VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) -VENDOR_1S_OBJS := $(call get-1s-objs,vendor) -VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS)) -VENDOR_2S_OBJS := $(call get-2s-objs,vendor) -VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS)) - -# Define some targets associated with the above object/binary files. 
-blis-nat-st: check-env $(BLIS_NAT_ST_BINS) -blis-nat-1s: check-env $(BLIS_NAT_1S_BINS) -blis-nat-2s: check-env $(BLIS_NAT_2S_BINS) - -openblas-st: check-env $(OPENBLAS_ST_BINS) -openblas-1s: check-env $(OPENBLAS_1S_BINS) -openblas-2s: check-env $(OPENBLAS_2S_BINS) -eigen-st: check-env $(EIGEN_ST_BINS) -eigen-1s: check-env $(EIGEN_1S_BINS) -eigen-2s: check-env $(EIGEN_2S_BINS) +VENDOR_MT_OBJS := $(call get-mt-objs,vendor) +VENDOR_MT_BINS := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS)) -vendor-st: check-env $(VENDOR_ST_BINS) -vendor-1s: check-env $(VENDOR_1S_BINS) -vendor-2s: check-env $(VENDOR_2S_BINS) +# List other miscellaneous object files +UTIL_OBJS := test_utils.o +UTIL_HDRS := test_utils.h -mkl-st: vendor-st -mkl-1s: vendor-1s -mkl-2s: vendor-2s - -armpl-st: vendor-st -armpl-1s: vendor-1s -armpl-2s: vendor-2s +# Define some targets associated with the above object/binary files. +blis-st: check-env $(BLIS_ST_BINS) +blis-mt: check-env $(BLIS_MT_BINS) +openblas-st: check-env $(OPENBLAS_ST_BINS) +openblas-mt: check-env $(OPENBLAS_MT_BINS) +eigen-st: check-env $(EIGEN_ST_BINS) +eigen-mt: check-env $(EIGEN_MT_BINS) +vendor-st: check-env $(VENDOR_ST_BINS) +vendor-mt: check-env $(VENDOR_MT_BINS) +mkl-st: vendor-st +mkl-mt: vendor-mt +armpl-st: vendor-st +armpl-mt: vendor-mt # Mark the object files as intermediate so that make will remove them # automatically after building the binaries on which they depend. 
-.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS) -.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS) -.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_1S_OBJS) $(EIGEN_2S_OBJS) -.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_1S_OBJS) $(VENDOR_2S_OBJS) +.INTERMEDIATE: $(BLIS_ST_OBJS) $(BLIS_MT_OBJS) +.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_MT_OBJS) +.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_MT_OBJS) +.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_MT_OBJS) +.INTERMEDIATE: $(UTIL_OBJS) # -- Object file rules -- -#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c -# $(CC) $(CFLAGS) -c $< -o $@ - -# A function to return the datatype cpp macro def from the datatype -# character. -get-dt-cpp = $(strip \ - $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\ - $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\ - $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\ - -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX)))) - # A function to return other cpp macros that help the test driver # identify the implementation. -#get-bl-cpp = $(strip \ -# $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\ -# $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\ -# $(if $(findstring eigen,$(1)),$(STR_EIG) $(EIG_DEF),\ -# $(STR_VEN) $(BLA_DEF))))) - get-bl-cpp = $(strip \ - $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\ - $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\ + $(if $(findstring blis,$(1)),$(STR_BLI) $(DEF_BLI),\ + $(if $(findstring openblas,$(1)),$(STR_OBL) $(DEF_BLA),\ $(if $(and $(findstring eigen,$(1)),\ $(findstring gemm,$(2))),\ - $(STR_EIG) $(EIG_DEF),\ + $(STR_EIG) $(DEF_EIG),\ $(if $(findstring eigen,$(1)),\ - $(STR_EIG) $(BLA_DEF),\ - $(STR_VEN) $(BLA_DEF)))))) + $(STR_EIG) $(DEF_BLA),\ + $(STR_VEN) $(DEF_BLA)))))) +# Rules for miscellaneous files. +test_utils.o: test_utils.c test_utils.h + $(CC) $(CFLAGS) -c $< -o $@ # Rules for BLIS and BLAS libraries. 
define make-st-rule -test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile - $(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@ -endef - -define make-1s-rule -test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile - $(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@ +test_$(1)_$(2)_st.o: test_$(op).c Makefile + $(CC) $(CFLAGS) $(call get-bl-cpp,$(2),$(1)) $(STR_ST) -c $$< -o $$@ endef -define make-2s-rule -test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile - $(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@ +define make-mt-rule +test_$(1)_$(2)_mt.o: test_$(op).c Makefile + $(CC) $(CFLAGS) $(call get-bl-cpp,$(2),$(1)) $(STR_MT) -c $$< -o $$@ endef -$(foreach dt,$(DTS), \ $(foreach op,$(OPS), \ -$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im)))))) +$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(op),$(im))))) -$(foreach dt,$(DTS), \ $(foreach op,$(OPS), \ -$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im)))))) - -$(foreach dt,$(DTS), \ -$(foreach op,$(OPS), \ -$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im)))))) +$(foreach im,$(BIMPLS),$(eval $(call make-mt-rule,$(op),$(im))))) # Rules for Eigen. +# NOTE: Eigen determines single- vs. multithreadedness at compile time. 
define make-eigst-rule -test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile - $(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@ -endef - -define make-eig1s-rule -test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile - $(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@ +test_$(1)_$(2)_st.o: test_$(op).c Makefile + $(CXX) $(CXXFLAGS_ST) $(call get-bl-cpp,$(2),$(1)) $(STR_ST) -c $$< -o $$@ endef -define make-eig2s-rule -test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile - $(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@ +define make-eigmt-rule +test_$(1)_$(2)_mt.o: test_$(op).c Makefile + $(CXX) $(CXXFLAGS_MT) $(call get-bl-cpp,$(2),$(1)) $(STR_MT) -c $$< -o $$@ endef -$(foreach dt,$(DTS), \ -$(foreach op,$(OPS), \ -$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im)))))) - -$(foreach dt,$(DTS), \ $(foreach op,$(OPS), \ -$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im)))))) +$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(op),$(im))))) -$(foreach dt,$(DTS), \ $(foreach op,$(OPS), \ -$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im)))))) +$(foreach im,$(EIMPLS),$(eval $(call make-eigmt-rule,$(op),$(im))))) # -- Executable file rules -- @@ -404,44 +318,36 @@ $(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im)))))) # compatibility layer. This prevents BLIS from inadvertently getting called # for the BLAS routines we are trying to test with. 
-test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) - -test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) - -test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) - +# Combine the miscellaneous objects with libblis for conciseness (since all +# driver binaries depend on these objects). +COMMON_OBJS := $(UTIL_OBJS) $(LIBBLIS_LINK) -test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_blis_st.x: test_%_blis_st.o $(COMMON_OBJS) + $(CC) $(strip $< $(COMMON_OBJS) $(LDFLAGS) -o $@) -test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_blis_mt.x: test_%_blis_mt.o $(COMMON_OBJS) + $(CC) $(strip $< $(COMMON_OBJS) $(LDFLAGS) -o $@) -test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_openblas_st.x: test_%_openblas_st.o $(COMMON_OBJS) + $(CC) $(strip $< $(OPENBLAS_LIB) $(COMMON_OBJS) $(LDFLAGS) -o $@) -test_%_$(PS_MAX)_eigen_st.x: test_%_$(PS_MAX)_eigen_st.o $(LIBBLIS_LINK) - $(CXX) $(strip $< $(EIGEN_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_openblas_mt.x: test_%_openblas_mt.o $(COMMON_OBJS) + $(CC) $(strip $< $(OPENBLASP_LIB) $(COMMON_OBJS) $(LDFLAGS) -o $@) -test_%_$(P1_MAX)_eigen_1s.x: test_%_$(P1_MAX)_eigen_1s.o $(LIBBLIS_LINK) - $(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) -test_%_$(P2_MAX)_eigen_2s.x: test_%_$(P2_MAX)_eigen_2s.o $(LIBBLIS_LINK) - $(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_eigen_st.x: test_%_eigen_st.o $(COMMON_OBJS) + $(CXX) $(strip $< $(EIGEN_LIB) 
$(COMMON_OBJS) $(LDFLAGS) -o $@) +test_%_eigen_mt.x: test_%_eigen_mt.o $(COMMON_OBJS) + $(CXX) $(strip $< $(EIGENP_LIB) $(COMMON_OBJS) $(LDFLAGS) -o $@) -test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) -test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_vendor_st.x: test_%_vendor_st.o $(COMMON_OBJS) + $(CC) $(strip $< $(VENDOR_LIB) $(COMMON_OBJS) $(LDFLAGS) -o $@) -test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK) - $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +test_%_vendor_mt.x: test_%_vendor_mt.o $(COMMON_OBJS) + $(CC) $(strip $< $(VENDORP_LIB) $(COMMON_OBJS) $(LDFLAGS) -o $@) # -- Environment check rules -- diff --git a/test/3/old/runme.sh b/test/3/old/runme.sh new file mode 100755 index 000000000..cf84bd121 --- /dev/null +++ b/test/3/old/runme.sh @@ -0,0 +1,277 @@ +#!/bin/bash + +# File pefixes. +exec_root="test" +out_root="output" +delay=0.1 + +sys="blis" +#sys="stampede2" +#sys="lonestar5" +#sys="ul252" +#sys="ul264" +#sys="ul2128" + +# Bind threads to processors. +#export OMP_PROC_BIND=true +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103" + +if [ ${sys} = "blis" ]; then + + export GOMP_CPU_AFFINITY="0-3" + + numactl="" + threads="jc1ic1jr1_st + jc2ic1jr1_1s + jc2ic2jr1_2s" + +elif [ ${sys} = "stampede2" ]; then + + echo "Need to set GOMP_CPU_AFFINITY." 
+ exit 1 + + numactl="" + threads="jc1ic1jr1_st + jc4ic6jr1_1s + jc4ic12jr1_2s" + +elif [ ${sys} = "lonestar5" ]; then + + export GOMP_CPU_AFFINITY="0-23" + + # A hack to use libiomp5 with gcc. + #export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" + + numactl="" + threads="jc1ic1jr1_st + jc2ic3jr2_1s + jc4ic3jr2_2s" + +elif [ ${sys} = "ul252" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0-51" + + numactl="" + threads="jc1ic1jr1_st + jc2ic13jr1_1s + jc4ic13jr1_2s" + +elif [ ${sys} = "ul264" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0-63" + + numactl="numactl --interleave=all" + threads="jc1ic1jr1_st + jc1ic8jr4_1s + jc2ic8jr4_2s" + +elif [ ${sys} = "ul2128" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0-127" + + numactl="numactl --interleave=all" + threads="jc1ic1jr1_st + jc4ic4jr4_1s + jc8ic4jr4_2s" + #threads="jc4ic4jr4_1s + # jc8ic4jr4_2s" + #threads="jc1ic1jr1_st" + #threads="jc4ic4jr4_1s" + #threads="jc8ic4jr4_2s" +fi + +# Datatypes to test. +test_dts="d s z c" +#test_dts="s" + +# Operations to test. +test_ops="gemm hemm herk trmm trsm" +#test_ops="herk" + +# Implementations to test. +impls="blis" +#impls="openblas" +#impls="vendor" +#impls="other" +#impls="eigen" +#impls="all" + +if [ "${impls}" = "blis" ]; then + + test_impls="asm_blis" + +elif [ "${impls}" = "openblas" ]; then + + test_impls="openblas" + +elif [ "${impls}" = "vendor" ]; then + + test_impls="vendor" + +elif [ "${impls}" = "eigen" ]; then + + test_impls="eigen" + +elif [ "${impls}" = "other" ]; then + + test_impls="openblas vendor eigen" +else + + test_impls="openblas asm_blis vendor eigen" +fi + +# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can +# restore the value. 
+GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY} + + +# Iterate over the threading configs. +for th in ${threads}; do + + # Start with one way of parallelism in each loop. We will now begin + # parsing the 'th' variable to update one or more of these threading + # parameters. + jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1 + + # Strip everything before and after the underscore so that what remains + # is the problem size and threading parameter string, respectively. + #psize=${th##*_}; thinfo=${th%%_*} + tsuf=${th##*_}; thinfo=${th%%_*} + + # Identify each threading parameter and insert a space before it. + thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" ) + + nt=1 + + for loopnum in ${thsep}; do + + # Given the current string, which identifies a loop and the + # number of ways of parallelism for that loop, strip out + # the ways and loop separately to identify each. + loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" ) + num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" ) + + # Construct a string that we can evaluate to set the number + # of ways of parallelism for the current loop. + loop_nt_eq_num="${loop}_nt=${num}" + + # Update the total number of threads. + nt=$(expr ${nt} \* ${num}) + + # Evaluate the string to assign the ways to the variable. + eval ${loop_nt_eq_num} + + done + + # Find a binary using the test driver prefix and the threading suffix. + # Then strip everything before and after the max problem size that's + # encoded into the name of the binary. + binname=$(ls -1 ${exec_root}_*_${tsuf}.x | head -n1) + temp1=${binname#${exec_root}_*_} + psize=${temp1%%_*} + + # Sanity check: If 'ls' couldn't find any binaries, then the user + # probably didn't build them. Inform the user and proceed to the next + # threading config. + if [ "${binname}" = "" ]; then + + echo "Could not find binaries corresponding to '${tsuf}' threading config. Skipping." + continue + fi + + # Let the user know what threading config we are working on. 
+ echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}" + + # Iterate over the datatypes. + for dt in ${test_dts}; do + + # Iterate over the implementations. + for im in ${test_impls}; do + + # Iterate over the operations. + for op in ${test_ops}; do + + # Eigen does not support multithreading for hemm, herk, trmm, + # or trsm. So if we're getting ready to execute an Eigen driver + # for one of these operations and nt > 1, we skip this test. + if [ "${im}" = "eigen" ] && \ + [ "${op}" != "gemm" ] && \ + [ "${nt}" != "1" ]; then + continue; + fi + + # Find the threading suffix by probing the executable. + binname=$(ls ${exec_root}_${dt}${op}_*_${im}_${tsuf}.x) + + #echo "found file: ${binname} with suffix ${suf}" + + # Set the number of threads according to th. + if [ "${tsuf}" = "1s" ] || [ "${tsuf}" = "2s" ]; then + + # Set the threading parameters based on the implementation + # that we are preparing to run. + if [ "${im}" = "asm_blis" ]; then + unset OMP_NUM_THREADS + export BLIS_JC_NT=${jc_nt} + export BLIS_PC_NT=${pc_nt} + export BLIS_IC_NT=${ic_nt} + export BLIS_JR_NT=${jr_nt} + export BLIS_IR_NT=${ir_nt} + elif [ "${im}" = "openblas" ]; then + unset OMP_NUM_THREADS + export OPENBLAS_NUM_THREADS=${nt} + elif [ "${im}" = "eigen" ]; then + export OMP_NUM_THREADS=${nt} + elif [ "${im}" = "vendor" ]; then + unset OMP_NUM_THREADS + export MKL_NUM_THREADS=${nt} + fi + export nt_use=${nt} + + # Multithreaded OpenBLAS seems to have a problem running + # properly if GOMP_CPU_AFFINITY is set. So we temporarily + # unset it here if we are about to execute OpenBLAS, but + # otherwise restore it. 
+ if [ ${im} = "openblas" ]; then + unset GOMP_CPU_AFFINITY + else + export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}" + fi + else + + export BLIS_JC_NT=1 + export BLIS_PC_NT=1 + export BLIS_IC_NT=1 + export BLIS_JR_NT=1 + export BLIS_IR_NT=1 + export OMP_NUM_THREADS=1 + export OPENBLAS_NUM_THREADS=1 + export MKL_NUM_THREADS=1 + export nt_use=1 + fi + + # Construct the name of the test executable. + exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${tsuf}.x" + + # Construct the name of the output file. + out_file="${out_root}_${tsuf}_${dt}${op}_${im}.m" + + #echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}" + echo "Running ${numactl} ./${exec_name} > ${out_file}" + + # Run executable with or without numactl, depending on how + # the numactl variable was set. + ${numactl} ./${exec_name} > ${out_file} + + # Bedtime! + sleep ${delay} + + done + done + done +done + diff --git a/test/3/runme.sh b/test/3/runme.sh index cf84bd121..fefcbe5ee 100755 --- a/test/3/runme.sh +++ b/test/3/runme.sh @@ -5,6 +5,18 @@ exec_root="test" out_root="output" delay=0.1 +# Bind threads to processors. +#export OMP_PROC_BIND=true +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103" + +# ------------------ + +# Problem size range for single- and multithreaded execution. Set psr_st and +# psr_mt on a per-system basis below to override these default values. +psr_st="100 1000 100" +psr_mt="200 2000 200" + sys="blis" #sys="stampede2" #sys="lonestar5" @@ -12,19 +24,15 @@ sys="blis" #sys="ul264" #sys="ul2128" -# Bind threads to processors. 
-#export OMP_PROC_BIND=true -#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" -#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103" - if [ ${sys} = "blis" ]; then export GOMP_CPU_AFFINITY="0-3" numactl="" threads="jc1ic1jr1_st - jc2ic1jr1_1s - jc2ic2jr1_2s" + jc2ic2jr1_mt" + #psr_st="40 1000 40" + #psr_mt="40 4000 40" elif [ ${sys} = "stampede2" ]; then @@ -33,8 +41,9 @@ elif [ ${sys} = "stampede2" ]; then numactl="" threads="jc1ic1jr1_st - jc4ic6jr1_1s - jc4ic12jr1_2s" + jc4ic12jr1_mt" + #psr_st="40 1000 40" + #psr_mt="40 4000 40" elif [ ${sys} = "lonestar5" ]; then @@ -45,8 +54,9 @@ elif [ ${sys} = "lonestar5" ]; then numactl="" threads="jc1ic1jr1_st - jc2ic3jr2_1s - jc4ic3jr2_2s" + jc4ic3jr2_mt" + #psr_st="40 1000 40" + #psr_mt="40 4000 40" elif [ ${sys} = "ul252" ]; then @@ -55,8 +65,9 @@ elif [ ${sys} = "ul252" ]; then numactl="" threads="jc1ic1jr1_st - jc2ic13jr1_1s - jc4ic13jr1_2s" + jc4ic13jr1_mt" + #psr_st="40 1000 40" + #psr_mt="40 4000 40" elif [ ${sys} = "ul264" ]; then @@ -65,8 +76,9 @@ elif [ ${sys} = "ul264" ]; then numactl="numactl --interleave=all" threads="jc1ic1jr1_st - jc1ic8jr4_1s - jc2ic8jr4_2s" + jc2ic8jr4_mt" + #psr_st="40 1000 40" + #psr_mt="40 4000 40" elif [ ${sys} = "ul2128" ]; then @@ -75,54 +87,42 @@ elif [ ${sys} = "ul2128" ]; then numactl="numactl --interleave=all" threads="jc1ic1jr1_st - jc4ic4jr4_1s - jc8ic4jr4_2s" - #threads="jc4ic4jr4_1s - # jc8ic4jr4_2s" - #threads="jc1ic1jr1_st" - #threads="jc4ic4jr4_1s" - #threads="jc8ic4jr4_2s" + jc8ic4jr4_mt" + + #psr_st="40 1000 40" + #psr_mt="40 4000 40" fi # Datatypes to test. -test_dts="d s z c" -#test_dts="s" +test_dts="s d c z" +test_dts="d" # Operations to test. 
-test_ops="gemm hemm herk trmm trsm" +test_ops="gemm_nn hemm_ll herk_ln trmm_llnn trsm_runn" #test_ops="herk" # Implementations to test. -impls="blis" -#impls="openblas" -#impls="vendor" -#impls="other" -#impls="eigen" -#impls="all" - -if [ "${impls}" = "blis" ]; then - - test_impls="asm_blis" - -elif [ "${impls}" = "openblas" ]; then - - test_impls="openblas" - -elif [ "${impls}" = "vendor" ]; then - - test_impls="vendor" +test_impls="blis" +#test_impls="openblas" +#test_impls="vendor" +#test_impls="eigen" +#test_impls="all" + +if [ "${test_impls}" = "all" ]; then + test_impls="openblas blis vendor eigen" +fi -elif [ "${impls}" = "eigen" ]; then +# Number of repeats per problem size. +nrepeats=3 - test_impls="eigen" +# The induced method to use ('native' or '1m'). +ind="native" -elif [ "${impls}" = "other" ]; then +# Quiet mode? +#quiet="yes" - test_impls="openblas vendor eigen" -else - - test_impls="openblas asm_blis vendor eigen" -fi +# For testing purposes. +#dryrun="yes" # Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can # restore the value. @@ -132,35 +132,41 @@ GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY} # Iterate over the threading configs. for th in ${threads}; do + #threads="jc1ic1jr1_st + # jc8ic4jr4_mt" + # Start with one way of parallelism in each loop. We will now begin # parsing the 'th' variable to update one or more of these threading # parameters. jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1 - # Strip everything before and after the underscore so that what remains - # is the problem size and threading parameter string, respectively. - #psize=${th##*_}; thinfo=${th%%_*} - tsuf=${th##*_}; thinfo=${th%%_*} + # Strip everything before the underscore so that what remains is the + # threading suffix. + tsuf=${th##*_}; + + # Strip everything after the underscore so that what remains is the + # parallelism (threading) info. + thinfo=${th%%_*} # Identify each threading parameter and insert a space before it.
- thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" ) + thinfo_sep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" ) nt=1 - for loopnum in ${thsep}; do + for loopnum in ${thinfo_sep}; do - # Given the current string, which identifies a loop and the - # number of ways of parallelism for that loop, strip out - # the ways and loop separately to identify each. + # Given the current string, which identifies a loop and the number of + # ways of parallelism to be obtained from that loop, strip out the ways + # and loop separately to identify each. loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" ) - num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" ) + nways=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" ) - # Construct a string that we can evaluate to set the number - # of ways of parallelism for the current loop. - loop_nt_eq_num="${loop}_nt=${num}" + # Construct a string that we can evaluate to set the number of ways of + # parallelism for the current loop (e.g. jc_nt, ic_nt, jr_nt). + loop_nt_eq_num="${loop}_nt=${nways}" # Update the total number of threads. - nt=$(expr ${nt} \* ${num}) + nt=$(expr ${nt} \* ${nways}) # Evaluate the string to assign the ways to the variable. eval ${loop_nt_eq_num} @@ -171,8 +177,6 @@ for th in ${threads}; do # Then strip everything before and after the max problem size that's # encoded into the name of the binary. binname=$(ls -1 ${exec_root}_*_${tsuf}.x | head -n1) - temp1=${binname#${exec_root}_*_} - psize=${temp1%%_*} # Sanity check: If 'ls' couldn't find any binaries, then the user # probably didn't build them. Inform the user and proceed to the next @@ -184,7 +188,7 @@ for th in ${threads}; do fi # Let the user know what threading config we are working on. - echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}" + echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt})" # Iterate over the datatypes. 
for dt in ${test_dts}; do @@ -195,26 +199,29 @@ for th in ${threads}; do # Iterate over the operations. for op in ${test_ops}; do + # Strip everything before the underscore so that what remains is + # the operation parameter string. + oppars=${op##*_}; + + # Strip everything after the underscore so that what remains is + # the operation name (sans parameter encoding). + opname=${op%%_*} + # Eigen does not support multithreading for hemm, herk, trmm, # or trsm. So if we're getting ready to execute an Eigen driver # for one of these operations and nt > 1, we skip this test. - if [ "${im}" = "eigen" ] && \ - [ "${op}" != "gemm" ] && \ - [ "${nt}" != "1" ]; then + if [ "${im}" = "eigen" ] && \ + [ "${opname}" != "gemm" ] && \ + [ "${nt}" != "1" ]; then continue; fi - # Find the threading suffix by probing the executable. - binname=$(ls ${exec_root}_${dt}${op}_*_${im}_${tsuf}.x) - - #echo "found file: ${binname} with suffix ${suf}" - # Set the number of threads according to th. - if [ "${tsuf}" = "1s" ] || [ "${tsuf}" = "2s" ]; then + if [ "${tsuf}" = "mt" ]; then # Set the threading parameters based on the implementation # that we are preparing to run. - if [ "${im}" = "asm_blis" ]; then + if [ "${im}" = "blis" ]; then unset OMP_NUM_THREADS export BLIS_JC_NT=${jc_nt} export BLIS_PC_NT=${pc_nt} @@ -241,8 +248,14 @@ for th in ${threads}; do else export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}" fi + + # Choose the mt problem size range. + psr="${psr_mt}" + else + # Set all environment variables to 1 to ensure single- + # threaded execution. export BLIS_JC_NT=1 export BLIS_PC_NT=1 export BLIS_IC_NT=1 @@ -252,20 +265,38 @@ for th in ${threads}; do export OPENBLAS_NUM_THREADS=1 export MKL_NUM_THREADS=1 export nt_use=1 + + # Choose the st problem size range. + psr="${psr_st}" + fi + + if [ "${quiet}" = "yes" ]; then + qv="-q" # quiet + else + qv="-v" # verbose (the default) fi # Construct the name of the test executable.
- exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${tsuf}.x" + exec_name="${exec_root}_${opname}_${im}_${tsuf}.x" # Construct the name of the output file. - out_file="${out_root}_${tsuf}_${dt}${op}_${im}.m" - - #echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}" - echo "Running ${numactl} ./${exec_name} > ${out_file}" + out_file="${out_root}_${tsuf}_${dt}${opname}_${oppars}_${im}.m" + + # Use printf for its formatting capabilities. + printf 'Running %s %-21s %s %-7s %s %s %s %s > %s\n' \ + "${numactl}" "./${exec_name}" "-d ${dt}" \ + "-c ${oppars}" \ + "-i ${ind}" \ + "-p \"${psr}\"" \ + "-r ${nrepeats}" \ + "${qv}" \ + "${out_file}" # Run executable with or without numactl, depending on how # the numactl variable was set. - ${numactl} ./${exec_name} > ${out_file} + if [ "${dryrun}" != "yes" ]; then + ${numactl} ./${exec_name} -d ${dt} -c ${oppars} -i ${ind} -p "${psr}" -r ${nrepeats} ${qv} > ${out_file} + fi # Bedtime! sleep ${delay} diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c index 96992f4a1..20bcca46c 100644 --- a/test/3/test_gemm.c +++ b/test/3/test_gemm.c @@ -36,18 +36,20 @@ #ifdef EIGEN #define BLIS_DISABLE_BLAS_DEFS #include "blis.h" + #include "test_utils.h" #include <Eigen/Core> #include <Eigen/src/misc/blas.h> using namespace Eigen; #else #include "blis.h" + #include "test_utils.h" #endif -#define COL_STORAGE -//#define ROW_STORAGE - //#define PRINT +static const char* LOCAL_OPNAME_STR = "gemm"; +static const char* LOCAL_PC_STR = "nn"; + int main( int argc, char** argv ) { obj_t a, b, c; @@ -70,65 +72,43 @@ int main( int argc, char** argv ) double dtime_save; double gflops; - //bli_init(); - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - n_repeats = 3; - - dt = DT; - - ind = IND; - -#if 1 - p_begin = P_BEGIN; - p_max = P_MAX; - p_inc = P_INC; - - m_input = -1; - n_input = -1; - k_input = -1; -#else - p_begin = 40; - p_max = 1000; - p_inc = 40; - - m_input = -1; - n_input = -1; - k_input = -1; -#endif - + params_t params; // Supress compiler warnings
about unused variable 'ind'. ( void )ind; -#if 0 - cntx_t* cntx; + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + // Parse the command line options into strings, integers, enums, + // and doubles, as appropriate. + parse_cl_params( argc, argv, init_def_params, &params ); - ind_t ind_mod = ind; + dt = params.dt; - // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod ); + ind = params.im; - // Set k to the kc blocksize for the current datatype. - k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + p_begin = params.sta; + p_max = params.end; + p_inc = params.inc; -#elif 1 + m_input = params.m; + n_input = params.n; + k_input = params.k; - //k_input = 256; + n_repeats = params.nr; -#endif - // Choose the char corresponding to the requested datatype. - if ( bli_is_float( dt ) ) dt_ch = 's'; - else if ( bli_is_double( dt ) ) dt_ch = 'd'; - else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; - else dt_ch = 'z'; + // Map the datatype to its corresponding char. + bli_param_map_blis_to_char_dt( dt, &dt_ch ); - transa = BLIS_NO_TRANSPOSE; - transb = BLIS_NO_TRANSPOSE; + // Map the parameter chars to their corresponding BLIS enum type values. + bli_param_map_char_to_blis_trans( params.pc_str[0], &transa ); + bli_param_map_char_to_blis_trans( params.pc_str[1], &transb ); + // Map the BLIS enum type values to their corresponding BLAS chars. bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); @@ -136,8 +116,8 @@ int main( int argc, char** argv ) // matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; - printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:4 ) = [ %5lu %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, @@ -158,17 +138,20 @@ int main( int argc, char** argv ) bli_obj_create( dt, 1, 1, 0, 0, &alpha ); bli_obj_create( dt, 1, 1, 0, 0, &beta ); - #ifdef COL_STORAGE - bli_obj_create( dt, m, k, 0, 0, &a ); - bli_obj_create( dt, k, n, 0, 0, &b ); - bli_obj_create( dt, m, n, 0, 0, &c ); - bli_obj_create( dt, m, n, 0, 0, &c_save ); - #else - bli_obj_create( dt, m, k, k, 1, &a ); - bli_obj_create( dt, k, n, n, 1, &b ); - bli_obj_create( dt, m, n, n, 1, &c ); - bli_obj_create( dt, m, n, n, 1, &c_save ); - #endif + // Choose the storage of each matrix based on the corresponding + // char in the params_t struct. Note that the expected order of + // storage specifiers in sc_str is CAB (not ABC).
+ if ( params.sc_str[1] == 'c' ) bli_obj_create( dt, m, k, 0, 0, &a ); + else bli_obj_create( dt, m, k, k, 1, &a ); + + if ( params.sc_str[2] == 'c' ) bli_obj_create( dt, k, n, 0, 0, &b ); + else bli_obj_create( dt, k, n, n, 1, &b ); + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c ); + else bli_obj_create( dt, m, n, n, 1, &c ); + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save ); + else bli_obj_create( dt, m, n, n, 1, &c_save ); bli_randm( &a ); bli_randm( &b ); @@ -177,12 +160,18 @@ int main( int argc, char** argv ) bli_obj_set_conjtrans( transa, &a ); bli_obj_set_conjtrans( transb, &b ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); + //bli_setsc( (2.0/1.0), 0.0, &alpha ); + //bli_setsc( (1.0/1.0), 0.0, &beta ); + bli_setsc( params.alpha, 0.0, &alpha ); + bli_setsc( params.beta, 0.0, &beta ); + + //bli_printm( "alpha:", &alpha, "%7.4e", "" ); + //bli_printm( "beta: ", &beta, "%7.4e", "" ); bli_copym( &c, &c_save ); -#if 0 //def BLIS +#ifdef BLIS + // Switch to the induced method specified by ind. 
bli_ind_disable_all_dt( dt ); bli_ind_enable_dt( ind, dt ); #endif @@ -196,58 +185,66 @@ int main( int argc, char** argv ) void* bp = bli_obj_buffer_at_off( &b ); void* cp = bli_obj_buffer_at_off( &c ); - #ifdef COL_STORAGE - const int os_a = bli_obj_col_stride( &a ); - const int os_b = bli_obj_col_stride( &b ); - const int os_c = bli_obj_col_stride( &c ); - #else - const int os_a = bli_obj_row_stride( &a ); - const int os_b = bli_obj_row_stride( &b ); - const int os_c = bli_obj_row_stride( &c ); - #endif + int os_a, os_b, os_c; + + if ( params.sc_str[0] == 'c' ) + { + os_a = bli_obj_col_stride( &a ); + os_b = bli_obj_col_stride( &b ); + os_c = bli_obj_col_stride( &c ); + } + else + { + os_a = bli_obj_row_stride( &a ); + os_b = bli_obj_row_stride( &b ); + os_c = bli_obj_row_stride( &c ); + } Stride<Dynamic, 1> stride_a( os_a, 1 ); Stride<Dynamic, 1> stride_b( os_b, 1 ); Stride<Dynamic, 1> stride_c( os_c, 1 ); - #ifdef COL_STORAGE - #if defined(IS_FLOAT) - typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_; - #elif defined (IS_DOUBLE) - typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_; - #elif defined (IS_SCOMPLEX) - typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_; - #elif defined (IS_DCOMPLEX) - typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_; - #endif - #else - #if defined(IS_FLOAT) - typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_; - #elif defined (IS_DOUBLE) - typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_; - #elif defined (IS_SCOMPLEX) - typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_; - #elif defined (IS_DCOMPLEX) - typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_; - #endif - #endif - #if defined(IS_FLOAT) - Map<MatrixXf_, 0, Stride<Dynamic, 1> > A( ( float* )ap, m, k, stride_a ); - Map<MatrixXf_, 0, Stride<Dynamic, 1> > B( ( float* )bp, k, n, stride_b ); - Map<MatrixXf_, 0, Stride<Dynamic, 1> > C( ( float* )cp, m, n, stride_c ); - #elif defined (IS_DOUBLE) - Map<MatrixXd_, 0, Stride<Dynamic, 1> > A( ( double* )ap, m, k, stride_a ); - Map<MatrixXd_, 0, Stride<Dynamic, 1> > B( ( double* )bp, k, n, stride_b ); - Map<MatrixXd_, 0, Stride<Dynamic, 1> > C( ( double* )cp, m, n, stride_c ); - #elif defined (IS_SCOMPLEX) - Map<MatrixXcf_, 0, Stride<Dynamic, 1> > A( ( std::complex<float>* )ap, m, k, stride_a ); - Map<MatrixXcf_, 0, Stride<Dynamic, 1> > B( ( std::complex<float>* )bp, k, n, stride_b ); - Map<MatrixXcf_, 0, Stride<Dynamic, 1> > C( ( std::complex<float>* )cp, m, n,
stride_c ); - #elif defined (IS_DCOMPLEX) - Map > A( ( std::complex* )ap, m, k, stride_a ); - Map > B( ( std::complex* )bp, k, n, stride_b ); - Map > C( ( std::complex* )cp, m, n, stride_c ); - #endif + typedef Matrix MatrixXs_c; + typedef Matrix MatrixXd_c; + typedef Matrix, Dynamic, Dynamic, ColMajor> MatrixXc_c; + typedef Matrix, Dynamic, Dynamic, ColMajor> MatrixXz_c; + + typedef Matrix MatrixXs_r; + typedef Matrix MatrixXd_r; + typedef Matrix, Dynamic, Dynamic, RowMajor> MatrixXc_r; + typedef Matrix, Dynamic, Dynamic, RowMajor> MatrixXz_r; + + Map > As_c( ( float* )ap, m, k, stride_a ); + Map > Bs_c( ( float* )bp, k, n, stride_b ); + Map > Cs_c( ( float* )cp, m, n, stride_c ); + + Map > Ad_c( ( double* )ap, m, k, stride_a ); + Map > Bd_c( ( double* )bp, k, n, stride_b ); + Map > Cd_c( ( double* )cp, m, n, stride_c ); + + Map > Ac_c( ( std::complex* )ap, m, k, stride_a ); + Map > Bc_c( ( std::complex* )bp, k, n, stride_b ); + Map > Cc_c( ( std::complex* )cp, m, n, stride_c ); + + Map > Az_c( ( std::complex* )ap, m, k, stride_a ); + Map > Bz_c( ( std::complex* )bp, k, n, stride_b ); + Map > Cz_c( ( std::complex* )cp, m, n, stride_c ); + + Map > As_r( ( float* )ap, m, k, stride_a ); + Map > Bs_r( ( float* )bp, k, n, stride_b ); + Map > Cs_r( ( float* )cp, m, n, stride_c ); + + Map > Ad_r( ( double* )ap, m, k, stride_a ); + Map > Bd_r( ( double* )bp, k, n, stride_b ); + Map > Cd_r( ( double* )cp, m, n, stride_c ); + + Map > Ac_r( ( std::complex* )ap, m, k, stride_a ); + Map > Bc_r( ( std::complex* )bp, k, n, stride_b ); + Map > Cc_r( ( std::complex* )cp, m, n, stride_c ); + + Map > Az_r( ( std::complex* )ap, m, k, stride_a ); + Map > Bz_r( ( std::complex* )bp, k, n, stride_b ); + Map > Cz_r( ( std::complex* )cp, m, n, stride_c ); #endif dtime_save = DBL_MAX; @@ -274,7 +271,22 @@ int main( int argc, char** argv ) #elif defined(EIGEN) - C.noalias() += alpha_r * A * B; + //C.noalias() += alpha_r * A * B; + + if ( params.sc_str[0] == 'c' ) + { + if ( params.dt_str[0] 
== 's' ) Cs_c.noalias() += alpha_r * As_c * Bs_c; + else if ( params.dt_str[0] == 'd' ) Cd_c.noalias() += alpha_r * Ad_c * Bd_c; + else if ( params.dt_str[0] == 'c' ) Cc_c.noalias() += alpha_r * Ac_c * Bc_c; + else if ( params.dt_str[0] == 'z' ) Cz_c.noalias() += alpha_r * Az_c * Bz_c; + } + else // if ( params.sc_str[0] == 'r' ) + { + if ( params.dt_str[0] == 's' ) Cs_r.noalias() += alpha_r * As_r * Bs_r; + else if ( params.dt_str[0] == 'd' ) Cd_r.noalias() += alpha_r * Ad_r * Bd_r; + else if ( params.dt_str[0] == 'c' ) Cc_r.noalias() += alpha_r * Ac_r * Bc_r; + else if ( params.dt_str[0] == 'z' ) Cz_r.noalias() += alpha_r * Az_r * Bz_r; + } #else // if defined(BLAS) @@ -293,15 +305,15 @@ int main( int argc, char** argv ) float* cp = ( float* )bli_obj_buffer( &c ); sgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); } else if ( bli_is_double( dt ) ) { @@ -318,15 +330,15 @@ int main( int argc, char** argv ) double* cp = ( double* )bli_obj_buffer( &c ); dgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); } else if ( bli_is_scomplex( dt ) ) { @@ -343,15 +355,15 @@ int main( int argc, char** argv ) scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); cgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); } else if ( bli_is_dcomplex( dt ) ) { @@ -368,15 +380,15 @@ int main( int argc, char** argv ) dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); zgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + 
bp, &ldb, + betap, + cp, &ldc ); } #endif @@ -392,12 +404,13 @@ int main( int argc, char** argv ) if ( bli_is_complex( dt ) ) gflops *= 4.0; - printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:4 ) = [ %5lu %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )k, ( unsigned long )n, gflops ); + fflush( stdout ); bli_obj_free( &alpha ); bli_obj_free( &beta ); @@ -413,3 +426,25 @@ int main( int argc, char** argv ) return 0; } +void init_def_params( params_t* params ) +{ + params->opname = LOCAL_OPNAME_STR; + params->impl = IMPL_STR; + + params->pc_str = LOCAL_PC_STR; + params->dt_str = GLOB_DEF_DT_STR; + params->sc_str = GLOB_DEF_SC_STR; + + params->im_str = GLOB_DEF_IM_STR; + + params->ps_str = GLOB_DEF_PS_STR; + params->m_str = GLOB_DEF_M_STR; + params->n_str = GLOB_DEF_N_STR; + params->k_str = GLOB_DEF_K_STR; + + params->nr_str = GLOB_DEF_NR_STR; + + params->alpha_str = GLOB_DEF_ALPHA_STR; + params->beta_str = GLOB_DEF_BETA_STR; +} + diff --git a/test/3/test_hemm.c b/test/3/test_hemm.c index 537378d43..d04d8cab2 100644 --- a/test/3/test_hemm.c +++ b/test/3/test_hemm.c @@ -34,9 +34,13 @@ #include #include "blis.h" +#include "test_utils.h" //#define PRINT +static const char* LOCAL_OPNAME_STR = "hemm"; +static const char* LOCAL_PC_STR = "ll"; + int main( int argc, char** argv ) { obj_t a, b, c; @@ -59,54 +63,42 @@ int main( int argc, char** argv ) double dtime_save; double gflops; - //bli_init(); - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - n_repeats = 3; - - dt = DT; - - ind = IND; - - p_begin = P_BEGIN; - p_max = P_MAX; - p_inc = P_INC; - - m_input = -1; - n_input = -1; - + params_t params; // Supress compiler warnings about unused variable 'ind'. 
( void )ind; -#if 0 - cntx_t* cntx; + //bli_init(); - ind_t ind_mod = ind; + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod ); + // Parse the command line options into strings, integers, enums, + // and doubles, as appropriate. + parse_cl_params( argc, argv, init_def_params, ¶ms ); - // Set k to the kc blocksize for the current datatype. - k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + dt = params.dt; -#elif 1 + ind = params.im; - //k_input = 256; + p_begin = params.sta; + p_max = params.end; + p_inc = params.inc; -#endif + m_input = params.m; + n_input = params.n; - // Choose the char corresponding to the requested datatype. - if ( bli_is_float( dt ) ) dt_ch = 's'; - else if ( bli_is_double( dt ) ) dt_ch = 'd'; - else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; - else dt_ch = 'z'; + n_repeats = params.nr; - side = BLIS_LEFT; - uploa = BLIS_LOWER; + // Map the datatype to its corresponding char. + bli_param_map_blis_to_char_dt( dt, &dt_ch ); + + // Map the parameter chars to their corresponding BLIS enum type values. + bli_param_map_char_to_blis_side( params.pc_str[0], &side ); + bli_param_map_char_to_blis_uplo( params.pc_str[1], &uploa ); + + // Map the BLIS enum type values to their corresponding BLAS chars. bli_param_map_blis_to_netlib_side( side, &f77_side ); bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); @@ -114,8 +106,8 @@ int main( int argc, char** argv ) // matlab allocates space for the entire array once up-front. 
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; - printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%chemm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); @@ -133,13 +125,28 @@ int main( int argc, char** argv ) bli_obj_create( dt, 1, 1, 0, 0, &alpha ); bli_obj_create( dt, 1, 1, 0, 0, &beta ); - if ( bli_is_left( side ) ) - bli_obj_create( dt, m, m, 0, 0, &a ); - else - bli_obj_create( dt, n, n, 0, 0, &a ); - bli_obj_create( dt, m, n, 0, 0, &b ); - bli_obj_create( dt, m, n, 0, 0, &c ); - bli_obj_create( dt, m, n, 0, 0, &c_save ); + // Choose the storage of each matrix based on the corresponding + // char in the params_t struct. Note that the expected order of + // storage specifers in sc_str is CAB (not ABC). + if ( params.sc_str[1] == 'c' ) + { + if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a ); + else bli_obj_create( dt, n, n, 0, 0, &a ); + } + else // if ( params.sc_str[1] == 'r' ) + { + if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, m, 1, &a ); + else bli_obj_create( dt, n, n, n, 1, &a ); + } + + if ( params.sc_str[2] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &b ); + else bli_obj_create( dt, m, n, n, 1, &b ); + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c ); + else bli_obj_create( dt, m, n, n, 1, &c ); + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save ); + else bli_obj_create( dt, m, n, n, 1, &c_save ); bli_randm( &a ); bli_randm( &b ); @@ -153,12 +160,15 @@ int main( int argc, char** argv ) bli_mkherm( &a ); bli_mktrim( &a ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); + //bli_setsc( (2.0/1.0), 0.0, &alpha ); + //bli_setsc( (1.0/1.0), 0.0, &beta ); + bli_setsc( params.alpha, 0.0, &alpha ); + bli_setsc( params.beta, 0.0, &beta ); bli_copym( &c, &c_save ); - -#if 
0 //def BLIS + +#ifdef BLIS + // Switch to the induced method specified by ind. bli_ind_disable_all_dt( dt ); bli_ind_enable_dt( ind, dt ); #endif @@ -202,14 +212,14 @@ int main( int argc, char** argv ) float* cp = ( float* )bli_obj_buffer( &c ); ssymm_( &f77_side, - &f77_uploa, - &mm, - &nn, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_uploa, + &mm, + &nn, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); } else if ( bli_is_double( dt ) ) { @@ -225,14 +235,14 @@ int main( int argc, char** argv ) double* cp = ( double* )bli_obj_buffer( &c ); dsymm_( &f77_side, - &f77_uploa, - &mm, - &nn, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_uploa, + &mm, + &nn, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); } else if ( bli_is_scomplex( dt ) ) { @@ -256,14 +266,14 @@ int main( int argc, char** argv ) #endif chemm_( &f77_side, - &f77_uploa, - &mm, - &nn, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_uploa, + &mm, + &nn, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); } else if ( bli_is_dcomplex( dt ) ) { @@ -287,14 +297,14 @@ int main( int argc, char** argv ) #endif zhemm_( &f77_side, - &f77_uploa, - &mm, - &nn, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); + &f77_uploa, + &mm, + &nn, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); } #endif @@ -313,11 +323,12 @@ int main( int argc, char** argv ) if ( bli_is_complex( dt ) ) gflops *= 4.0; - printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%chemm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); + fflush( stdout ); bli_obj_free( &alpha ); bli_obj_free( &beta ); @@ -333,3 +344,24 @@ int main( int argc, char** argv ) return 0; } +void init_def_params( params_t* params ) +{ + params->opname = LOCAL_OPNAME_STR; + params->impl = 
IMPL_STR; + + params->pc_str = LOCAL_PC_STR; + params->dt_str = GLOB_DEF_DT_STR; + params->sc_str = GLOB_DEF_SC_STR; + + params->im_str = GLOB_DEF_IM_STR; + + params->ps_str = GLOB_DEF_PS_STR; + params->m_str = GLOB_DEF_M_STR; + params->n_str = GLOB_DEF_N_STR; + + params->nr_str = GLOB_DEF_NR_STR; + + params->alpha_str = GLOB_DEF_ALPHA_STR; + params->beta_str = GLOB_DEF_BETA_STR; +} + diff --git a/test/3/test_herk.c b/test/3/test_herk.c index 6dbaf1936..a713b6766 100644 --- a/test/3/test_herk.c +++ b/test/3/test_herk.c @@ -35,9 +35,13 @@ #include #include "blis.h" +#include "test_utils.h" //#define PRINT +static const char* LOCAL_OPNAME_STR = "herk"; +static const char* LOCAL_PC_STR = "ln"; + int main( int argc, char** argv ) { obj_t a, c; @@ -60,55 +64,43 @@ int main( int argc, char** argv ) double dtime_save; double gflops; - //bli_init(); - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - n_repeats = 3; - - dt = DT; - dt_real = bli_dt_proj_to_real( DT ); - - ind = IND; - - p_begin = P_BEGIN; - p_max = P_MAX; - p_inc = P_INC; - - m_input = -1; - k_input = -1; - + params_t params; // Supress compiler warnings about unused variable 'ind'. ( void )ind; -#if 0 - cntx_t* cntx; + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + // Parse the command line options into strings, integers, enums, + // and doubles, as appropriate. + parse_cl_params( argc, argv, init_def_params, ¶ms ); - ind_t ind_mod = ind; + dt = params.dt; + dt_real = bli_dt_proj_to_real( dt ); - // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod ); + ind = params.im; - // Set k to the kc blocksize for the current datatype. 
- k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + p_begin = params.sta; + p_max = params.end; + p_inc = params.inc; -#elif 1 + m_input = params.m; + k_input = params.k; - //k_input = 256; + n_repeats = params.nr; -#endif - // Choose the char corresponding to the requested datatype. - if ( bli_is_float( dt ) ) dt_ch = 's'; - else if ( bli_is_double( dt ) ) dt_ch = 'd'; - else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; - else dt_ch = 'z'; + // Map the datatype to its corresponding char. + bli_param_map_blis_to_char_dt( dt, &dt_ch ); - uploc = BLIS_LOWER; - transa = BLIS_NO_TRANSPOSE; + // Map the parameter chars to their corresponding BLIS enum type values. + bli_param_map_char_to_blis_uplo( params.pc_str[0], &uploc ); + bli_param_map_char_to_blis_trans( params.pc_str[1], &transa ); + // Map the BLIS enum type values to their corresponding BLAS chars. bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc ); bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); @@ -116,8 +108,8 @@ int main( int argc, char** argv ) // matlab allocates space for the entire array once up-front. 
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; - printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); @@ -135,15 +127,25 @@ int main( int argc, char** argv ) bli_obj_create( dt_real, 1, 1, 0, 0, &alpha ); bli_obj_create( dt, 1, 1, 0, 0, &beta ); - if ( bli_does_trans( transa ) ) - bli_obj_create( dt, k, m, 0, 0, &a ); - else - bli_obj_create( dt, m, k, 0, 0, &a ); - bli_obj_create( dt, m, m, 0, 0, &c ); - //bli_obj_create( dt, m, k, 2, 2*m, &a ); - //bli_obj_create( dt, k, n, 2, 2*k, &b ); - //bli_obj_create( dt, m, n, 2, 2*m, &c ); - bli_obj_create( dt, m, m, 0, 0, &c_save ); + // Choose the storage of each matrix based on the corresponding + // char in the params_t struct. Note that the expected order of + // storage specifers in sc_str is CA (not AC). + if ( params.sc_str[1] == 'c' ) + { + if ( bli_does_trans( transa ) ) bli_obj_create( dt, k, m, 0, 0, &a ); + else bli_obj_create( dt, m, k, 0, 0, &a ); + } + else // if ( params.sc_str[1] == 'r' ) + { + if ( bli_does_trans( transa ) ) bli_obj_create( dt, k, m, m, 1, &a ); + else bli_obj_create( dt, m, k, k, 1, &a ); + } + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, m, 0, 0, &c ); + else bli_obj_create( dt, m, m, m, 1, &c ); + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, m, 0, 0, &c_save ); + else bli_obj_create( dt, m, m, m, 1, &c_save ); bli_randm( &a ); bli_randm( &c ); @@ -151,14 +153,22 @@ int main( int argc, char** argv ) bli_obj_set_struc( BLIS_HERMITIAN, &c ); bli_obj_set_uplo( uploc, &c ); + // Make C densely Hermitian, and zero the unstored triangle to + // ensure the implementation reads only from the stored region. 
+ bli_mkherm( &c ); + bli_mktrim( &c ); + bli_obj_set_conjtrans( transa, &a ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); + //bli_setsc( (2.0/1.0), 0.0, &alpha ); + //bli_setsc( (1.0/1.0), 0.0, &beta ); + bli_setsc( params.alpha, 0.0, &alpha ); + bli_setsc( params.beta, 0.0, &beta ); bli_copym( &c, &c_save ); - -#if 0 //def BLIS + +#ifdef BLIS + // Switch to the induced method specified by ind. bli_ind_disable_all_dt( dt ); bli_ind_enable_dt( ind, dt ); #endif @@ -197,13 +207,13 @@ int main( int argc, char** argv ) float* cp = ( float* )bli_obj_buffer( &c ); ssyrk_( &f77_uploc, - &f77_transa, - &mm, - &kk, - alphap, - ap, &lda, - betap, - cp, &ldc ); + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); } else if ( bli_is_double( dt ) ) { @@ -217,13 +227,13 @@ int main( int argc, char** argv ) double* cp = ( double* )bli_obj_buffer( &c ); dsyrk_( &f77_uploc, - &f77_transa, - &mm, - &kk, - alphap, - ap, &lda, - betap, - cp, &ldc ); + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); } else if ( bli_is_scomplex( dt ) ) { @@ -244,13 +254,13 @@ int main( int argc, char** argv ) #endif cherk_( &f77_uploc, - &f77_transa, - &mm, - &kk, - alphap, - ap, &lda, - betap, - cp, &ldc ); + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); } else if ( bli_is_dcomplex( dt ) ) { @@ -271,13 +281,13 @@ int main( int argc, char** argv ) #endif zherk_( &f77_uploc, - &f77_transa, - &mm, - &kk, - alphap, - ap, &lda, - betap, - cp, &ldc ); + &f77_transa, + &mm, + &kk, + alphap, + ap, &lda, + betap, + cp, &ldc ); } #endif @@ -293,11 +303,12 @@ int main( int argc, char** argv ) if ( bli_is_complex( dt ) ) gflops *= 4.0; - printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%cherk_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( 
unsigned long )k, gflops ); + fflush( stdout ); bli_obj_free( &alpha ); bli_obj_free( &beta ); @@ -312,3 +323,24 @@ int main( int argc, char** argv ) return 0; } +void init_def_params( params_t* params ) +{ + params->opname = LOCAL_OPNAME_STR; + params->impl = IMPL_STR; + + params->pc_str = LOCAL_PC_STR; + params->dt_str = GLOB_DEF_DT_STR; + params->sc_str = GLOB_DEF_SC_STR; + + params->im_str = GLOB_DEF_IM_STR; + + params->ps_str = GLOB_DEF_PS_STR; + params->m_str = GLOB_DEF_M_STR; + params->k_str = GLOB_DEF_K_STR; + + params->nr_str = GLOB_DEF_NR_STR; + + params->alpha_str = GLOB_DEF_ALPHA_STR; + params->beta_str = GLOB_DEF_BETA_STR; +} + diff --git a/test/3/test_trmm.c b/test/3/test_trmm.c index 4e58b95fa..2ecbb19b1 100644 --- a/test/3/test_trmm.c +++ b/test/3/test_trmm.c @@ -35,9 +35,13 @@ #include #include "blis.h" +#include "test_utils.h" //#define PRINT +static const char* LOCAL_OPNAME_STR = "trmm"; +static const char* LOCAL_PC_STR = "llnn"; + int main( int argc, char** argv ) { obj_t a, c; @@ -64,64 +68,44 @@ int main( int argc, char** argv ) double dtime_save; double gflops; - //bli_init(); - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - n_repeats = 3; - - dt = DT; - - ind = IND; - - p_begin = P_BEGIN; - p_max = P_MAX; - p_inc = P_INC; - - m_input = -1; - n_input = -1; - + params_t params; // Supress compiler warnings about unused variable 'ind'. ( void )ind; -#if 0 - cntx_t* cntx; + //bli_init(); - ind_t ind_mod = ind; + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod ); + // Parse the command line options into strings, integers, enums, + // and doubles, as appropriate. + parse_cl_params( argc, argv, init_def_params, ¶ms ); - // Set k to the kc blocksize for the current datatype. 
- k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + dt = params.dt; -#elif 1 + ind = params.im; - //k_input = 256; + p_begin = params.sta; + p_max = params.end; + p_inc = params.inc; -#endif + m_input = params.m; + n_input = params.n; - // Choose the char corresponding to the requested datatype. - if ( bli_is_float( dt ) ) dt_ch = 's'; - else if ( bli_is_double( dt ) ) dt_ch = 'd'; - else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; - else dt_ch = 'z'; + n_repeats = params.nr; -#if 0 - side = BLIS_LEFT; -#else - side = BLIS_RIGHT; -#endif -#if 0 - uploa = BLIS_LOWER; -#else - uploa = BLIS_UPPER; -#endif - transa = BLIS_NO_TRANSPOSE; - diaga = BLIS_NONUNIT_DIAG; + // Map the datatype to its corresponding char. + bli_param_map_blis_to_char_dt( dt, &dt_ch ); + + // Map the parameter chars to their corresponding BLIS enum type values. + bli_param_map_char_to_blis_side( params.pc_str[0], &side ); + bli_param_map_char_to_blis_uplo( params.pc_str[1], &uploa ); + bli_param_map_char_to_blis_trans( params.pc_str[2], &transa ); + bli_param_map_char_to_blis_diag( params.pc_str[3], &diaga ); + + // Map the BLIS enum type values to their corresponding BLAS chars. bli_param_map_blis_to_netlib_side( side, &f77_side ); bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); @@ -131,8 +115,8 @@ int main( int argc, char** argv ) // matlab allocates space for the entire array once up-front. 
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; - printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); @@ -149,12 +133,26 @@ int main( int argc, char** argv ) bli_obj_create( dt, 1, 1, 0, 0, &alpha ); - if ( bli_is_left( side ) ) - bli_obj_create( dt, m, m, 0, 0, &a ); - else - bli_obj_create( dt, n, n, 0, 0, &a ); - bli_obj_create( dt, m, n, 0, 0, &c ); - bli_obj_create( dt, m, n, 0, 0, &c_save ); + // Choose the storage of each matrix based on the corresponding + // char in the params_t struct. Note that the expected order of + // storage specifers in sc_str is CA (not AC). Also note that + // C plays the role of matrix B. + if ( params.sc_str[1] == 'c' ) + { + if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a ); + else bli_obj_create( dt, n, n, 0, 0, &a ); + } + else // if ( params.sc_str[1] == 'r' ) + { + if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, m, 1, &a ); + else bli_obj_create( dt, n, n, n, 1, &a ); + } + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c ); + else bli_obj_create( dt, m, n, n, 1, &c ); + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save ); + else bli_obj_create( dt, m, n, n, 1, &c_save ); bli_randm( &a ); bli_randm( &c ); @@ -164,14 +162,16 @@ int main( int argc, char** argv ) bli_obj_set_conjtrans( transa, &a ); bli_obj_set_diag( diaga, &a ); - bli_randm( &a ); + // Zero the unstored triangle. bli_mktrim( &a ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); + //bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( params.alpha, 0.0, &alpha ); bli_copym( &c, &c_save ); - -#if 0 //def BLIS + +#ifdef BLIS + // Switch to the induced method specified by ind. 
bli_ind_disable_all_dt( dt ); bli_ind_enable_dt( ind, dt ); #endif @@ -209,14 +209,14 @@ int main( int argc, char** argv ) float* cp = ( float* )bli_obj_buffer( &c ); strmm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } else if ( bli_is_double( dt ) ) { @@ -229,14 +229,14 @@ int main( int argc, char** argv ) double* cp = ( double* )bli_obj_buffer( &c ); dtrmm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } else if ( bli_is_scomplex( dt ) ) { @@ -255,14 +255,14 @@ int main( int argc, char** argv ) #endif ctrmm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } else if ( bli_is_dcomplex( dt ) ) { @@ -281,14 +281,14 @@ int main( int argc, char** argv ) #endif ztrmm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } #endif @@ -307,11 +307,12 @@ int main( int argc, char** argv ) if ( bli_is_complex( dt ) ) gflops *= 4.0; - printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); + fflush( stdout ); bli_obj_free( &alpha ); @@ -325,3 +326,24 @@ int main( int argc, char** argv ) return 0; } +void init_def_params( params_t* params ) +{ + params->opname = LOCAL_OPNAME_STR; + params->impl = IMPL_STR; + + params->pc_str = LOCAL_PC_STR; 
+ params->dt_str = GLOB_DEF_DT_STR; + params->sc_str = GLOB_DEF_SC_STR; + + params->im_str = GLOB_DEF_IM_STR; + + params->ps_str = GLOB_DEF_PS_STR; + params->m_str = GLOB_DEF_M_STR; + params->n_str = GLOB_DEF_N_STR; + + params->nr_str = GLOB_DEF_NR_STR; + + params->alpha_str = GLOB_DEF_ALPHA_STR; + params->beta_str = GLOB_DEF_BETA_STR; +} + diff --git a/test/3/test_trsm.c b/test/3/test_trsm.c index 4897d4627..4b92f6128 100644 --- a/test/3/test_trsm.c +++ b/test/3/test_trsm.c @@ -35,9 +35,13 @@ #include #include "blis.h" +#include "test_utils.h" //#define PRINT +static const char* LOCAL_OPNAME_STR = "trsm"; +static const char* LOCAL_PC_STR = "llnn"; + int main( int argc, char** argv ) { obj_t a, c; @@ -64,64 +68,44 @@ int main( int argc, char** argv ) double dtime_save; double gflops; - //bli_init(); - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - n_repeats = 3; - - dt = DT; - - ind = IND; - - p_begin = P_BEGIN; - p_max = P_MAX; - p_inc = P_INC; - - m_input = -1; - n_input = -1; - + params_t params; // Supress compiler warnings about unused variable 'ind'. ( void )ind; -#if 0 - cntx_t* cntx; + //bli_init(); - ind_t ind_mod = ind; + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod ); + // Parse the command line options into strings, integers, enums, + // and doubles, as appropriate. + parse_cl_params( argc, argv, init_def_params, ¶ms ); - // Set k to the kc blocksize for the current datatype. - k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + dt = params.dt; -#elif 1 + ind = params.im; - //k_input = 256; + p_begin = params.sta; + p_max = params.end; + p_inc = params.inc; -#endif + m_input = params.m; + n_input = params.n; - // Choose the char corresponding to the requested datatype. 
- if ( bli_is_float( dt ) ) dt_ch = 's'; - else if ( bli_is_double( dt ) ) dt_ch = 'd'; - else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; - else dt_ch = 'z'; + n_repeats = params.nr; -#if 0 - side = BLIS_LEFT; -#else - side = BLIS_RIGHT; -#endif -#if 0 - uploa = BLIS_LOWER; -#else - uploa = BLIS_UPPER; -#endif - transa = BLIS_NO_TRANSPOSE; - diaga = BLIS_NONUNIT_DIAG; + // Map the datatype to its corresponding char. + bli_param_map_blis_to_char_dt( dt, &dt_ch ); + + // Map the parameter chars to their corresponding BLIS enum type values. + bli_param_map_char_to_blis_side( params.pc_str[0], &side ); + bli_param_map_char_to_blis_uplo( params.pc_str[1], &uploa ); + bli_param_map_char_to_blis_trans( params.pc_str[2], &transa ); + bli_param_map_char_to_blis_diag( params.pc_str[3], &diaga ); + + // Map the BLIS enum type values to their corresponding BLAS chars. bli_param_map_blis_to_netlib_side( side, &f77_side ); bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa ); bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); @@ -131,8 +115,8 @@ int main( int argc, char** argv ) // matlab allocates space for the entire array once up-front. for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; - printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )0, ( unsigned long )0, 0.0 ); @@ -149,13 +133,26 @@ int main( int argc, char** argv ) bli_obj_create( dt, 1, 1, 0, 0, &alpha ); - if ( bli_is_left( side ) ) - bli_obj_create( dt, m, m, 0, 0, &a ); - else - bli_obj_create( dt, n, n, 0, 0, &a ); - bli_obj_create( dt, m, n, 0, 0, &c ); - //bli_obj_create( dt, m, n, n, 1, &c ); - bli_obj_create( dt, m, n, 0, 0, &c_save ); + // Choose the storage of each matrix based on the corresponding + // char in the params_t struct. 
Note that the expected order of + // storage specifers in sc_str is CA (not AC). Also note that + // C plays the role of matrix B. + if ( params.sc_str[1] == 'c' ) + { + if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, 0, 0, &a ); + else bli_obj_create( dt, n, n, 0, 0, &a ); + } + else // if ( params.sc_str[1] == 'r' ) + { + if ( bli_is_left( side ) ) bli_obj_create( dt, m, m, m, 1, &a ); + else bli_obj_create( dt, n, n, n, 1, &a ); + } + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c ); + else bli_obj_create( dt, m, n, n, 1, &c ); + + if ( params.sc_str[0] == 'c' ) bli_obj_create( dt, m, n, 0, 0, &c_save ); + else bli_obj_create( dt, m, n, n, 1, &c_save ); bli_randm( &a ); bli_randm( &c ); @@ -165,17 +162,19 @@ int main( int argc, char** argv ) bli_obj_set_conjtrans( transa, &a ); bli_obj_set_diag( diaga, &a ); - bli_randm( &a ); + // Zero the unstored triangle. bli_mktrim( &a ); // Load the diagonal of A to make it more likely to be invertible. bli_shiftd( &BLIS_TWO, &a ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); + //bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( params.alpha, 0.0, &alpha ); bli_copym( &c, &c_save ); - -#if 0 //def BLIS + +#ifdef BLIS + // Switch to the induced method specified by ind. 
bli_ind_disable_all_dt( dt ); bli_ind_enable_dt( ind, dt ); #endif @@ -213,14 +212,14 @@ int main( int argc, char** argv ) float* cp = ( float* )bli_obj_buffer( &c ); strsm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } else if ( bli_is_double( dt ) ) { @@ -233,14 +232,14 @@ int main( int argc, char** argv ) double* cp = ( double* )bli_obj_buffer( &c ); dtrsm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } else if ( bli_is_scomplex( dt ) ) { @@ -259,14 +258,14 @@ int main( int argc, char** argv ) #endif ctrsm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } else if ( bli_is_dcomplex( dt ) ) { @@ -285,14 +284,14 @@ int main( int argc, char** argv ) #endif ztrsm_( &f77_side, - &f77_uploa, - &f77_transa, - &f77_diaga, - &mm, - &kk, - alphap, - ap, &lda, - cp, &ldc ); + &f77_uploa, + &f77_transa, + &f77_diaga, + &mm, + &kk, + alphap, + ap, &lda, + cp, &ldc ); } #endif @@ -311,11 +310,12 @@ int main( int argc, char** argv ) if ( bli_is_complex( dt ) ) gflops *= 4.0; - printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n", + printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, IMPL_STR ); + printf( "( %4lu, 1:3 ) = [ %5lu %5lu %8.2f ];\n", ( unsigned long )(p - p_begin)/p_inc + 1, ( unsigned long )m, ( unsigned long )n, gflops ); + fflush( stdout ); bli_obj_free( &alpha ); @@ -329,3 +329,24 @@ int main( int argc, char** argv ) return 0; } +void init_def_params( params_t* params ) +{ + params->opname = LOCAL_OPNAME_STR; + params->impl = IMPL_STR; + + params->pc_str = LOCAL_PC_STR; 
+ params->dt_str = GLOB_DEF_DT_STR; + params->sc_str = GLOB_DEF_SC_STR; + + params->im_str = GLOB_DEF_IM_STR; + + params->ps_str = GLOB_DEF_PS_STR; + params->m_str = GLOB_DEF_M_STR; + params->n_str = GLOB_DEF_N_STR; + + params->nr_str = GLOB_DEF_NR_STR; + + params->alpha_str = GLOB_DEF_ALPHA_STR; + params->beta_str = GLOB_DEF_BETA_STR; +} + diff --git a/test/3/test_utils.c b/test/3/test_utils.c new file mode 100644 index 000000000..8e441d055 --- /dev/null +++ b/test/3/test_utils.c @@ -0,0 +1,684 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_utils.h" + +// Global string constants. +const char* GLOB_DEF_DT_STR = "d"; +const char* GLOB_DEF_SC_STR = "ccc"; +const char* GLOB_DEF_IM_STR = "native"; + +const char* GLOB_DEF_PS_STR = "50 1000 50"; +const char* GLOB_DEF_M_STR = "-1"; +const char* GLOB_DEF_N_STR = "-1"; +const char* GLOB_DEF_K_STR = "-1"; + +const char* GLOB_DEF_NR_STR = "3"; + +const char* GLOB_DEF_ALPHA_STR = "1.0"; +const char* GLOB_DEF_BETA_STR = "1.0"; + + +void parse_cl_params( int argc, char** argv, init_fp fp, params_t* params ) +{ + bool gave_option_c = FALSE; + bool gave_option_d = FALSE; + bool gave_option_s = FALSE; + + bool gave_option_i = FALSE; + + bool gave_option_p = FALSE; + bool gave_option_m = FALSE; + bool gave_option_n = FALSE; + bool gave_option_k = FALSE; + + bool gave_option_r = FALSE; + + bool gave_option_a = FALSE; + bool gave_option_b = FALSE; + + int opt; + char opt_ch; + + getopt_t state; + + // Initialize the params_t struct with the caller-supplied function. + fp( params ); + + // Copy the binary name pointer so we can use it later. + params->bin = argv[0]; + + // Alias the binary name for conciseness. + const char* bin = params->bin; + + // Initialize the state for running bli_getopt(). Here, 0 is the + // initial value for opterr, which suppresses error messages. + bli_getopt_init_state( 0, &state ); + + // Process all option arguments until we get a -1, which means we're done. 
+ while( (opt = bli_getopt( argc, ( const char* const * )argv, "c:d:s:i:p:m:n:k:r:a:b:qvh", &state )) != -1 ) + { + // Explicitly typecast opt, which is an int, to a char. (Failing to + // typecast resulted in at least one user-reported problem whereby + // opt was being filled with garbage.) + opt_ch = ( char )opt; + + switch( opt_ch ) + { + case 'c': + params->pc_str = state.optarg; + gave_option_c = TRUE; + break; + + case 'd': + params->dt_str = state.optarg; + gave_option_d = TRUE; + break; + + case 's': + params->sc_str = state.optarg; + gave_option_s = TRUE; + break; + + + case 'i': + params->im_str = state.optarg; + gave_option_i = TRUE; + break; + + + case 'p': + params->ps_str = state.optarg; + gave_option_p = TRUE; + break; + + case 'm': + params->m_str = state.optarg; + gave_option_m = TRUE; + break; + + case 'n': + params->n_str = state.optarg; + gave_option_n = TRUE; + break; + + case 'k': + params->k_str = state.optarg; + gave_option_k = TRUE; + break; + + + case 'r': + params->nr_str = state.optarg; + gave_option_r = TRUE; + break; + + + case 'a': + params->alpha_str = state.optarg; + gave_option_a = TRUE; + break; + + case 'b': + params->beta_str = state.optarg; + gave_option_b = TRUE; + break; + + + case 'q': + params->verbose = FALSE; + break; + + case 'v': + params->verbose = TRUE; + break; + + case 'h': + { + bool has_trans = FALSE; + bool has_side = FALSE; + bool has_uplo = FALSE; + bool has_unit = FALSE; + + if ( is_gemm( params ) || + is_herk( params ) || + is_trmm( params ) || + is_trsm( params ) ) has_trans = TRUE; + + if ( is_hemm( params ) || + is_trmm( params ) || + is_trsm( params ) ) has_side = TRUE; + + if ( is_hemm( params ) || + is_herk( params ) || + is_trmm( params ) || + is_trsm( params ) ) has_uplo = TRUE; + + if ( is_trmm( params ) || + is_trsm( params ) ) has_unit = TRUE; + + printf( "\n" ); + printf( " %s performance driver\n", params->opname ); + printf( " -----------------------\n" ); + printf( " (part of the BLIS 
framework)\n" ); + printf( "\n" ); + printf( " Measure performance of the '%s' implementation of the '%s' operation:\n", params->impl, params->opname ); + printf( "\n" ); + if ( is_gemm( params ) ) + { + printf( " C := beta * C + alpha * trans(A) * trans(B)\n" ); + printf( "\n" ); + printf( " where C is an m x n matrix, trans(A) is an m x k matrix, and\n" ); + printf( " trans(B) is a k x n matrix.\n" ); + } + else if ( is_hemm( params ) ) + { + printf( " C := beta * C + alpha * uplo(A) * B (side = left)\n" ); + printf( " C := beta * C + alpha * B * uplo(A) (side = right)\n" ); + printf( "\n" ); + printf( " where C and B are m x n matrices and A is a Hermitian matrix stored\n" ); + printf( " in the lower or upper triangle, as specified by uplo(A). When side =\n" ); + printf( " left, A is m x m, and when side = right, A is n x n.\n" ); + } + else if ( is_herk( params ) ) + { + printf( " uplo(C) := beta * uplo(C) + alpha * trans(A) * trans(A)^H\n" ); + printf( "\n" ); + printf( " where C is an m x m Hermitian matrix stored in the lower or upper\n" ); + printf( " triangle, as specified by uplo(C), and trans(A) is an m x k matrix.\n" ); + } + else if ( is_trmm( params ) ) + { + printf( " B := alpha * trans(uplo(A)) * B (side = left)\n" ); + printf( " B := alpha * B * trans(uplo(A)) (side = right)\n" ); + printf( "\n" ); + printf( " where B is an m x n matrix and A is a triangular matrix stored in\n" ); + printf( " the lower or upper triangle, as specified by uplo(A), with unit/non-unit\n" ); + printf( " diagonal specified by diag(A). 
When side = left, A is m x m, and when\n" ); + printf( " side = right, A is n x n.\n" ); + } + else if ( is_trsm( params ) ) + { + printf( " B := alpha * trans(uplo(A))^{-1} * B (side = left)\n" ); + printf( " B := alpha * B * trans(uplo(A))^{-1} (side = right)\n" ); + printf( "\n" ); + printf( " where B is an m x n matrix and A is a triangular matrix stored in\n" ); + printf( " the lower or upper triangle, as specified by uplo(A), with unit/non-unit\n" ); + printf( " diagonal specified by diag(A). When side = left, A is m x m, and when\n" ); + printf( " side = right, A is n x n. Note that while ^{-1} indicates inversion,\n" ); + printf( " trsm does not explicitly invert A, but rather solves for an m x n\n" ); + printf( " solution matrix X, which then overwrites the original contents of B.\n" ); + } + printf( "\n" ); + printf( " Performance measurements are taken for a range of problem sizes with a fixed\n" ); + printf( " set of parameters, and results are printed to stdout in a matlab/octave-\n" ); + printf( " friendly format.\n" ); + printf( "\n" ); + printf( " Usage:\n" ); + printf( "\n" ); + printf( " %s [options]\n", bin ); + printf( "\n" ); + printf( " The following computational options are supported:\n" ); + printf( "\n" ); + printf( " -c pc\n" ); + printf( " Use the operation-specific parameter combination specified by\n" ); + printf( " the 'pc' string. 
The following tables list expected parameters\n" ); + printf( " for the '%s' operation and the valid values for each parameter.\n", params->opname ); + printf( "\n" ); + printf( " Operation List (order) of parameters Example\n" ); + printf( " -------------------------------------------------------\n" ); + if ( is_gemm( params ) ) + { + printf( " gemm trans(A) trans(B) -c tn\n" ); + } + else if ( is_hemm( params ) ) + { + printf( " hemm/symm side(A) uplo(A) -c rl\n" ); + } + else if ( is_herk( params ) ) + { + printf( " herk/syrk uplo(C) trans(A) -c ln\n" ); + } + else if ( is_trmm( params ) ) + { + printf( " trmm side(A) uplo(A) trans(A) unit(A) -c lutn\n" ); + } + else if ( is_trsm( params ) ) + { + printf( " trsm side(A) uplo(A) trans(A) unit(A) -c rlnn\n" ); + } + printf( "\n" ); + printf( " Valid\n" ); + printf( " Param chars Interpretation\n" ); + printf( " ---------------------------------------\n" ); + if ( has_trans ) + { + printf( " trans n No transpose\n" ); + printf( " t Transpose only\n" ); + printf( " c Conjugate only*\n" ); + printf( " h Hermitian transpose\n" ); + printf( "\n" ); + } + if ( has_side ) + { + printf( " side l Left\n" ); + printf( " r Right\n" ); + printf( "\n" ); + } + if ( has_uplo ) + { + printf( " uplo l Lower-stored\n" ); + printf( " u Upper-stored\n" ); + printf( "\n" ); + } + if ( has_unit ) + { + printf( " unit u Unit diagonal\n" ); + printf( " n Non-unit diagonal\n" ); + printf( "\n" ); + } + if ( has_trans ) + { + printf( " *This option is supported by BLIS but not by classic BLAS.\n" ); + } + printf( "\n" ); + printf( " -d dt\n" ); + printf( " Allocate matrix elements using the datatype character specified\n" ); + printf( " by dt, and also perform computation in that same datatype. 
Valid\n" ); + printf( " char values for dt are:\n" ); + printf( "\n" ); + printf( " Valid\n" ); + printf( " chars Interpretation\n" ); + printf( " -----------------------------------------\n" ); + printf( " s single-precision real domain\n" ); + printf( " d double-precision real domain\n" ); + printf( " c single-precision complex domain\n" ); + printf( " z double-precision complex domain\n" ); + printf( "\n" ); + printf( " -s sc\n" ); + printf( " Use the characters in sc to determine the storage formats\n" ); + printf( " of each operand matrix used in the performance measurements.\n" ); + printf( " Valid chars are 'r' (row storage) and 'c' (column storage).\n" ); + printf( " The characters encode the storage format for each operand\n" ); + printf( " used by %s, with the mapping of chars to operand interpreted\n", params->opname ); + printf( " in the following order:\n" ); + printf( "\n" ); + printf( " Order of\n" ); + printf( " operand \n" ); + printf( " Operation mapping Example Interpretation\n" ); + printf( " ----------------------------------------------------------\n" ); + if ( is_gemm( params ) ) + { + printf( " gemm C A B -s crr C is col-stored;\n" ); + printf( " A and B are row-stored.\n" ); + } + else if ( is_hemm( params ) ) + { + printf( " hemm/symm C A B -s rcc C is row-stored;\n" ); + printf( " A and B are col-stored.\n" ); + } + else if ( is_herk( params ) ) + { + printf( " herk/syrk C A -s rc C is row-stored;\n" ); + printf( " A is col-stored.\n" ); + } + else if ( is_trmm( params ) ) + { + printf( " trmm B A -s cr B is col-stored;\n" ); + printf( " A is row-stored.\n" ); + } + else if ( is_trsm( params ) ) + { + printf( " trsm B A -s cc B and A are col-stored.\n" ); + } + printf( "\n" ); + printf( " -i im\n" ); + printf( " Use native execution if im is 'native' (or 'nat'). 
Otherwise,\n" ); + printf( " if im is '1m', use the 1m method to induce complex computation\n" ); + printf( " using the equivalent real-domain microkernels.\n" ); + printf( "\n" ); + printf( " -p 'lo hi in'\n" ); + printf( " Perform a sweep of measurements of problem sizes ranging from \n" ); + printf( " 'lo' to 'hi' in increments of 'in'. Note that measurements will\n" ); + printf( " be taken in descending order, starting from 'hi', and so 'lo'\n" ); + printf( " will act as a floor and may not be measured (see 2nd example).\n" ); + printf( "\n" ); + printf( " Example Interpretation\n" ); + printf( " -------------------------------------------------------\n" ); + printf( " -p '40 400 40' Measure performance from 40 to 400\n" ); + printf( " (inclusive) in increments of 40.\n" ); + printf( " -p '40 400 80' Measure performance for problem sizes\n" ); + printf( " {80,160,240,320,400}.\n" ); + printf( "\n" ); + printf( " Note that unlike the other option arguments, quotes are required\n" ); + printf( " around the 'lo hi in' string in order to facilitate parsing.\n" ); + printf( "\n" ); + printf( " -m M\n" ); + if ( is_gemm( params ) || is_hemm( params ) || is_trmm( params ) || is_trsm( params ) ) + printf( " -n N\n" ); + if ( is_gemm( params ) || is_herk( params ) ) + printf( " -k K\n" ); + if ( is_gemm( params ) ) + { + printf( " Bind the m, n, or k dimensions to M, N, or K, respectively.\n" ); + printf( " Binding of matrix dimensions takes place as follows:\n" ); + } + else if ( is_herk( params ) ) + { + printf( " Bind the m or k dimensions to M or K, respectively. Binding\n" ); + printf( " of matrix dimensions takes place as follows:\n" ); + } + else if ( is_hemm( params ) || is_trmm( params ) || is_trsm( params ) ) + { + printf( " Bind the m or n dimensions to M or N, respectively. 
Binding\n" ); + printf( " of matrix dimensions takes place as follows:\n" ); + } + printf( "\n" ); + printf( " if 0 < X: Bind the x dimension to X and hold it constant.\n" ); + printf( " if X = -1: Bind the x dimension to p.\n" ); + printf( " if X < -1: Bind the x dimension to p / abs(X).\n" ); + printf( "\n" ); + printf( " where p is the current problem size. Note: X = 0 is undefined.\n" ); + printf( "\n" ); + printf( " Examples Interpretation\n" ); + printf( " ---------------------------------------------------------\n" ); + if ( is_gemm( params ) ) + { + printf( " -m -1 -n -1 -k -1 Bind m, n, and k to the problem size\n" ); + printf( " to keep all matrices square.\n" ); + printf( " -m -1 -n -1 -k 100 Bind m and n to the problem size, but\n" ); + printf( " hold k = 100 constant.\n" ); + } + else if ( is_hemm( params ) ) + { + printf( " -m -1 -n -1 Bind m and n to the problem size to\n" ); + printf( " keep all matrices square.\n" ); + printf( " -m -1 -n 500 Bind m to the problem size, but hold\n" ); + printf( " n = 500 constant.\n" ); + } + else if ( is_herk( params ) ) + { + printf( " -m -1 -k -1 Bind m and k to the problem size to\n" ); + printf( " keep both matrices square.\n" ); + printf( " -m -1 -k 200 Bind m to the problem size, but hold\n" ); + printf( " k = 200 constant.\n" ); + } + else if ( is_trmm( params ) || is_trsm( params ) ) + { + printf( " -m -1 -n -1 Bind m and n to the problem size to\n" ); + printf( " keep both matrices square.\n" ); + printf( " -m -1 -n 300 Bind m to the problem size, but hold\n" ); + printf( " n = 300 constant.\n" ); + } + printf( "\n" ); + printf( " -r num\n" ); + printf( " When measuring performance for a given problem size, perform num\n" ); + printf( " repetitions and report performance using the best timing.\n" ); + printf( "\n" ); + if ( is_gemm( params ) || is_hemm( params ) || is_herk( params ) ) + { + printf( " -a alpha\n" ); + printf( " -b beta\n" ); + printf( " Specify the value to use for the alpha and/or beta 
scalars.\n" ); + } + else // if ( is_trmm( params ) || is_trsm( params ) ) + { + printf( " -a alpha\n" ); + printf( " Specify the value to use for the alpha scalar.\n" ); + } + printf( "\n" ); + printf( " If any of the computational options is not specified, its default value will\n" ); + printf( " be used. (Please use the -v option to see how the driver is interpreting each\n" ); + printf( " option.)\n" ); + printf( "\n" ); + printf( " The following IO options are also supported:\n" ); + printf( "\n" ); + printf( " -q\n" ); + printf( " -v\n" ); + printf( " Enable quiet or verbose output. (By default, output is quiet.)\n" ); + printf( " The verbose option is useful if you are unsure whether your options\n" ); + printf( " are being interpreted as you intended.\n" ); + printf( "\n" ); + printf( " -h\n" ); + printf( " Display this help and exit.\n" ); + printf( "\n" ); + printf( "\n" ); + + exit(0); + + break; + } + + + case '?': + printf( "%s: unexpected option '%c' given or missing option argument\n", bin, state.optopt ); + exit(1); + break; + + default: + printf( "%s: unexpected option character returned from getopt: %c\n", bin, opt_ch ); + exit(1); + } + } + + // Process the command line options from strings to integers/enums/doubles, + // as appropriate. + proc_params( params ); + + // Inform the user of the values that were chosen (or defaulted to). + if ( params->verbose ) + { + const char* def_str = " (default)"; + const char* nul_str = " "; + + printf( "%%\n" ); + printf( "%% operation: %s\n", params->opname ); + printf( "%% parameter combination: %s%s\n", params->pc_str, ( gave_option_c ? nul_str : def_str ) ); + printf( "%% datatype: %s%s\n", params->dt_str, ( gave_option_d ? nul_str : def_str ) ); + printf( "%% storage combination: %s%s\n", params->sc_str, ( gave_option_s ? nul_str : def_str ) ); + printf( "%% induced method: %s%s\n", params->im_str, ( gave_option_i ? 
nul_str : def_str ) ); + printf( "%% problem size range: %s%s\n", params->ps_str, ( gave_option_p ? nul_str : def_str ) ); + printf( "%% m dim specifier: %s%s\n", params->m_str, ( gave_option_m ? nul_str : def_str ) ); + if ( is_gemm( params ) || is_hemm( params ) || is_trmm( params ) || is_trsm( params ) ) + printf( "%% n dim specifier: %s%s\n", params->n_str, ( gave_option_n ? nul_str : def_str ) ); + if ( is_gemm( params ) || is_herk( params ) ) + printf( "%% k dim specifier: %s%s\n", params->k_str, ( gave_option_k ? nul_str : def_str ) ); + printf( "%% number of repeats: %s%s\n", params->nr_str, ( gave_option_r ? nul_str : def_str ) ); + printf( "%% alpha scalar: %s%s\n", params->alpha_str, ( gave_option_a ? nul_str : def_str ) ); + if ( is_gemm( params ) || is_hemm( params ) || is_herk( params ) ) + printf( "%% beta scalar: %s%s\n", params->beta_str, ( gave_option_b ? nul_str : def_str ) ); + printf( "%% ---\n" ); + printf( "%% implementation: %s\n", params->impl ); + if ( params->nt == -1 ) + printf( "%% number of threads: %s\n", "unset (defaults to 1)" ); + else + printf( "%% number of threads: %ld\n", params->nt ); + printf( "%% thread affinity: %s\n", ( params->af_str == NULL ? "unset" : params->af_str ) ); + printf( "%%\n" ); + } + + + // If there are still arguments remaining after getopt() processing is + // complete, print an error. + if ( state.optind < argc ) + { + printf( "%s: encountered unexpected non-option argument: %s\n", bin, argv[ state.optind ] ); + exit(1); + } +} + +// ----------------------------------------------------------------------------- + +void proc_params( params_t* params ) +{ + dim_t nt; + + // Binary name doesn't need any conversion. + + // Operation name doesn't need any conversion. + + // Implementation name doesn't need any conversion. + + // Query the multithreading strings and convert them to integers. 
+ if ( strncmp( params->impl, "blis", MAX_STRING_SIZE ) == 0 ) + { + nt = bli_thread_get_num_threads(); + } + else if ( strncmp( params->impl, "mkl", MAX_STRING_SIZE ) == 0 ) + { + nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); + + if ( nt == -1 ) nt = bli_env_get_var( "MKL_NUM_THREADS", -1 ); + } + else if ( strncmp( params->impl, "openblas", MAX_STRING_SIZE ) == 0 ) + { + nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); + + if ( nt == -1 ) nt = bli_env_get_var( "OPENBLAS_NUM_THREADS", -1 ); + } + else + { + nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); + } + + // Store nt to the params_t struct. + params->nt = ( long int )nt; + + // Store the affinity string pointer to the params_t struct. + params->af_str = bli_env_get_str( "GOMP_CPU_AFFINITY" ); + +#if 0 + dim_t nt = bli_thread_get_num_threads(); + dim_t jc_nt = bli_thread_get_jc_nt(); + dim_t pc_nt = bli_thread_get_pc_nt(); + dim_t ic_nt = bli_thread_get_ic_nt(); + dim_t jr_nt = bli_thread_get_jr_nt(); + dim_t ir_nt = bli_thread_get_ir_nt(); + + if ( nt == -1 ) nt = 1; + if ( jc_nt == -1 ) jc_nt = 1; + if ( pc_nt == -1 ) pc_nt = 1; + if ( ic_nt == -1 ) ic_nt = 1; + if ( jr_nt == -1 ) jr_nt = 1; + if ( ir_nt == -1 ) ir_nt = 1; + + params->nt = ( long int )nt; + params->jc_nt = ( long int )jc_nt; + params->pc_nt = ( long int )pc_nt; + params->ic_nt = ( long int )ic_nt; + params->jr_nt = ( long int )jr_nt; + params->ir_nt = ( long int )ir_nt; +#endif + + // Parameter combinations, datatype, and operand storage combination, + // need no conversion. + + // Convert the datatype to a num_t. + bli_param_map_char_to_blis_dt( params->dt_str[0], &params->dt ); + + // Parse the induced method to the corresponding ind_t. 
+ if ( strncmp( params->im_str, "native", 6 ) == 0 ) + { + params->im = BLIS_NAT; + } + else if ( strncmp( params->im_str, "1m", 2 ) == 0 ) + { + params->im = BLIS_1M; + } + else + { + printf( "%s: invalid induced method '%s'.\n", params->bin, params->im_str ); + exit(1); + } + + // Convert the problem size range and dimension specifier strings to + // integers. + sscanf( params->ps_str, "%ld %ld %ld", &(params->sta), + &(params->end), + &(params->inc) ); + sscanf( params->m_str, "%ld", &(params->m) ); + sscanf( params->n_str, "%ld", &(params->n) ); + sscanf( params->k_str, "%ld", &(params->k) ); + + // Convert the number of repeats to an integer. + sscanf( params->nr_str, "%ld", &(params->nr) ); + + // Convert the alpha and beta strings to doubles. + //params->alpha = ( double )atof( params->alpha_str ); + //params->beta = ( double )atof( params->beta_str ); + //sscanf( params->alpha_str, "%lf", &(params->alpha) ); + //sscanf( params->beta_str, "%lf", &(params->beta) ); + params->alpha = strtod( params->alpha_str, NULL ); + params->beta = strtod( params->beta_str, NULL ); +} + +// ----------------------------------------------------------------------------- + +bool is_match( const char* str1, const char* str2 ) +{ + if ( strncmp( str1, str2, MAX_STRING_SIZE ) == 0 ) return TRUE; + return FALSE; +} + +bool is_gemm( params_t* params ) +{ + if ( is_match( params->opname, "gemm" ) ) return TRUE; + return FALSE; +} + +bool is_hemm( params_t* params ) +{ + if ( is_match( params->opname, "hemm" ) ) return TRUE; + return FALSE; +} + +bool is_herk( params_t* params ) +{ + if ( is_match( params->opname, "herk" ) ) return TRUE; + return FALSE; +} + +bool is_trmm( params_t* params ) +{ + if ( is_match( params->opname, "trmm" ) ) return TRUE; + return FALSE; +} + +bool is_trsm( params_t* params ) +{ + if ( is_match( params->opname, "trsm" ) ) return TRUE; + return FALSE; +} + diff --git a/test/3/test_utils.h b/test/3/test_utils.h new file mode 100644 index 000000000..088f9ce97 
--- /dev/null +++ b/test/3/test_utils.h @@ -0,0 +1,142 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifndef TEST_UTILS_H +#define TEST_UTILS_H + +// Allow C++ users to include this header file in their source code. However, +// we make the extern "C" conditional on whether we're using a C++ compiler, +// since regular C compilers don't understand the extern "C" construct. 
+#ifdef __cplusplus +extern "C" { +#endif + +// String arrays allocated using this constant will always add 1 to +// the value defined below, and so the total allocated will still be +// a nice power of two. +#define MAX_STRING_SIZE 31 + + +extern const char* GLOB_DEF_DT_STR; +extern const char* GLOB_DEF_SC_STR; +extern const char* GLOB_DEF_IM_STR; + +extern const char* GLOB_DEF_PS_STR; +extern const char* GLOB_DEF_M_STR; +extern const char* GLOB_DEF_N_STR; +extern const char* GLOB_DEF_K_STR; + +extern const char* GLOB_DEF_NR_STR; + +extern const char* GLOB_DEF_ALPHA_STR; +extern const char* GLOB_DEF_BETA_STR; + + +typedef struct params_s +{ + // Binary name. + const char* bin; + + // Operation name. + const char* opname; + + // Implementation name. + const char* impl; + + // Multithreading parameters: number of threads and affinity string. + const char* nt_str; + long int nt; + const char* af_str; + + // Parameter combinations, datatype, operand storage combination, + // and induced method. + const char* pc_str; + const char* dt_str; + const char* sc_str; + num_t dt; + + const char* im_str; + ind_t im; + + // Problem size range and dimension specifiers. + const char* ps_str; + const char* m_str; + const char* n_str; + const char* k_str; + long int sta; + long int end; + long int inc; + long int m; + long int n; + long int k; + + // Number of repeats. + const char* nr_str; + long int nr; + + // Value of alpha and beta. + const char* alpha_str; + const char* beta_str; + double alpha; + double beta; + + // A flag controlling whether to print informational messages. 
+ bool verbose; + +} params_t; + +typedef void (*init_fp)( params_t* params ); + +// ----------------------------------------------------------------------------- + +void init_def_params( params_t* params ); +void parse_cl_params( int argc, char** argv, init_fp fp, params_t* params ); +void proc_params( params_t* params ); + +// ----------------------------------------------------------------------------- + +bool is_match( const char* str1, const char* str2 ); +bool is_gemm( params_t* params ); +bool is_hemm( params_t* params ); +bool is_herk( params_t* params ); +bool is_trmm( params_t* params ); +bool is_trsm( params_t* params ); + +#ifdef __cplusplus +} +#endif + +#endif From b861c71b50c6d48cb07282f44aa9dddffc1f1b3f Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 23 Sep 2022 13:22:27 -0500 Subject: [PATCH 089/230] Add consistent NaN/Inf handling in sumsqv. (#668) Details: - Changed sumsqv implementation as follows: - If there is a NaN (either real or imaginary), then return a sum of NaN and unit scale. - Else, if there is an Inf (either real or imaginary), then return a sum of +Inf and unit scale. - Otherwise behave as normal. 
--- frame/util/bli_util_unb_var1.c | 56 ++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 2b65c8460..3c501d107 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -1068,6 +1068,7 @@ void PASTEMAC(ch,varname) \ ctype_r scale_r; \ ctype_r sumsq_r; \ ctype_r abs_chi1_r; \ + ctype_r abs_chi1_i; \ dim_t i; \ \ /* NOTE: This function attempts to mimic the algorithm for computing @@ -1085,10 +1086,47 @@ void PASTEMAC(ch,varname) \ PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ \ abs_chi1_r = bli_fabs( chi1_r ); \ + abs_chi1_i = bli_fabs( chi1_i ); \ +\ + if ( bli_isnan( abs_chi1_r ) ) \ + { \ + sumsq_r = abs_chi1_r; \ + scale_r = one_r; \ + } \ +\ + if ( bli_isnan( abs_chi1_i ) ) \ + { \ + sumsq_r = abs_chi1_i; \ + scale_r = one_r; \ + } \ +\ + if ( bli_isnan( sumsq_r ) ) \ + { \ + chi1 += incx; \ + continue; \ + } \ +\ + if ( bli_isinf( abs_chi1_r ) ) \ + { \ + sumsq_r = abs_chi1_r; \ + scale_r = one_r; \ + } \ +\ + if ( bli_isinf( abs_chi1_i ) ) \ + { \ + sumsq_r = abs_chi1_i; \ + scale_r = one_r; \ + } \ +\ + if ( bli_isinf( sumsq_r ) ) \ + { \ + chi1 += incx; \ + continue; \ + } \ \ /* Accumulate real component into sumsq, adjusting scale if needed. */ \ - if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \ + if ( abs_chi1_r > zero_r ) \ { \ if ( scale_r < abs_chi1_r ) \ { \ @@ -1104,25 +1142,23 @@ void PASTEMAC(ch,varname) \ ( abs_chi1_r / scale_r ); \ } \ } \ -\ - abs_chi1_r = bli_fabs( chi1_i ); \ \ /* Accumulate imaginary component into sumsq, adjusting scale if needed. 
*/ \ - if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \ + if ( abs_chi1_i > zero_r ) \ { \ - if ( scale_r < abs_chi1_r ) \ + if ( scale_r < abs_chi1_i ) \ { \ sumsq_r = one_r + \ - sumsq_r * ( scale_r / abs_chi1_r ) * \ - ( scale_r / abs_chi1_r ); \ + sumsq_r * ( scale_r / abs_chi1_i ) * \ + ( scale_r / abs_chi1_i ); \ \ - PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \ + PASTEMAC(chr,copys)( abs_chi1_i, scale_r ); \ } \ else \ { \ - sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \ - ( abs_chi1_r / scale_r ); \ + sumsq_r = sumsq_r + ( abs_chi1_i / scale_r ) * \ + ( abs_chi1_i / scale_r ); \ } \ } \ \ From 42d0e66318b186d25eeb215b40ce26115401ed8b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 29 Sep 2022 17:38:02 -0500 Subject: [PATCH 090/230] Add AddressSanitizer (-fsanitize=address) option. (#669) Details: - Added support for AddressSanitizer (ASan), a compiler-integrated memory error detector. The option (disabled by default) enables compiling and linking with the -fsanitize=address flag supported by clang, gcc, and probably others. This flag is employed during compilation of all BLIS source files *except* for optimized kernels, which are exempted because ASan usually requires an extra register, which violates the constraints for many gemm microkernels. - Minor whitespace, comment, ordering, and configure help text updates. --- Makefile | 1 + build/config.mk.in | 3 +++ common.mk | 34 ++++++++++++++++++++------ configure | 61 ++++++++++++++++++++++++++++++++++------------ 4 files changed, 76 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 5c4a32b59..04cdca421 100644 --- a/Makefile +++ b/Makefile @@ -1161,6 +1161,7 @@ showconfig: check-env @echo "install includedir: $(INSTALL_INCDIR)" @echo "install sharedir: $(INSTALL_SHAREDIR)" @echo "debugging status: $(DEBUG_TYPE)" + @echo "enable AddressSanitizer? $(MK_ENABLE_ASAN)" @echo "enabled threading model(s): $(THREADING_MODEL)" @echo "enable BLAS API? 
$(MK_ENABLE_BLAS)" @echo "enable CBLAS API? $(MK_ENABLE_CBLAS)" diff --git a/build/config.mk.in b/build/config.mk.in index 849a7ccfa..efb123366 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -124,6 +124,9 @@ LDFLAGS_PRESET := @ldflags_preset@ # The level of debugging info to generate. DEBUG_TYPE := @debug_type@ +# Whether to compile and link the AddressSanitizer library. +MK_ENABLE_ASAN := @enable_asan@ + # Whether operating system support was requested via --enable-system. ENABLE_SYSTEM := @enable_system@ diff --git a/common.mk b/common.mk index 00b9f8ad3..e69b97782 100644 --- a/common.mk +++ b/common.mk @@ -118,6 +118,7 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ -DBLIS_CNAME=$(1) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ -DBLIS_IN_REF_KERNEL=1 \ @@ -129,6 +130,7 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ $(COMPSIMDFLAGS) \ -DBLIS_CNAME=$(1) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ -DBLIS_IN_REF_KERNEL=1 \ @@ -137,12 +139,14 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + $(BUILD_ASANFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) @@ -201,11 +205,14 @@ get-sandbox-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ # Define a separate function that will return appropriate flags for use by # applications that want to use the same basic flags as those used when BLIS # was compiled. (NOTE: This is the same as the $(get-frame-cflags-for ...) 
-# function, except that it omits two variables that contain flags exclusively -# for use when BLIS is being compiled/built: BUILD_CPPFLAGS, which contains a -# cpp macro that confirms that BLIS is being built; and BUILD_SYMFLAGS, which -# contains symbol export flags that are only needed when a shared library is -# being compiled/linked.) +# function, except that it omits a few variables that contain flags exclusively +# for use when BLIS is being compiled/built: +# - BUILD_CPPFLAGS, which contains a cpp macro that confirms that BLIS +# is being built; +# - BUILD_SYMFLAGS, which contains symbol export flags that are only +# needed when a shared library is being compiled/linked; and +# - BUILD_ASANFLAGS, which contains a flag that causes the compiler to +# insert instrumentation for memory error detection. get-user-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ ) @@ -563,6 +570,11 @@ ifeq ($(DEBUG_TYPE),sde) LDFLAGS := $(filter-out $(LIBMEMKIND),$(LDFLAGS)) endif +# If AddressSanitizer is enabled, add the compiler flag to LDFLAGS. +ifeq ($(MK_ENABLE_ASAN),yes) +LDFLAGS += -fsanitize=address +endif + # Specify the shared library's 'soname' field. # NOTE: The flag for creating shared objects is different for Linux and OS X. ifeq ($(OS_NAME),Darwin) @@ -796,11 +808,19 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c)) CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c)))) +# --- AddressSanitizer flags --- + +ifeq ($(MK_ENABLE_ASAN),yes) +BUILD_ASANFLAGS := -fsanitize=address +else +BUILD_ASANFLAGS := +endif + # --- Threading flags --- # NOTE: We don't have to explicitly omit -pthread when --disable-system is given -# since that option forces --enable-threading=none, and thus -pthread never gets -# added to begin with. 
+# since that option forces --enable-threading=single, and thus -pthread never +# gets added to begin with. CTHREADFLAGS := diff --git a/configure b/configure index 858ce55de..a53f25380 100755 --- a/configure +++ b/configure @@ -224,12 +224,22 @@ print_usage() echo " " echo " --enable-mem-tracing, --disable-mem-tracing" echo " " - echo " Enable (disable by default) output to stdout that traces" + echo " Enable (disabled by default) output to stdout that traces" echo " the allocation and freeing of memory, including the names" echo " of the functions that triggered the allocation/freeing." echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." echo " Please use only for informational/debugging purposes." echo " " + echo " --enable-asan, --disable-asan" + echo " " + echo " Enable (disabled by default) compiling and linking BLIS" + echo " framework code with the AddressSanitizer (ASan) library." + echo " Optimized kernels are NOT compiled with ASan support due" + echo " to limitations of register assignment in inline assembly." + echo " WARNING: ENABLING THIS OPTION WILL NEGATIVELY IMPACT" + echo " PERFORMANCE. Please use only for informational/debugging" + echo " purposes." + echo " " echo " -i SIZE, --int-size=SIZE" echo " " echo " Set the size (in bits) of internal BLIS integers and" @@ -2451,6 +2461,9 @@ main() debug_type='' debug_flag='' + # A flag indicating whether AddressSanitizer should be used. + enable_asan='no' + # The system flag. enable_system='yes' @@ -2576,6 +2589,12 @@ main() disable-debug) debug_flag=0 ;; + enable-asan) + enable_asan='yes' + ;; + disable-asan) + enable_asan='no' + ;; enable-verbose-make) enable_verbose='yes' ;; @@ -3357,6 +3376,20 @@ main() echo "${script_name}: no preset LDFLAGS detected." fi + # Check if the verbose make flag was specified. + if [ "x${enable_verbose}" = "xyes" ]; then + echo "${script_name}: enabling verbose make output. 
(disable with 'make V=0'.)" + else + echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)" + fi + + # Check if the ARG_MAX hack was requested. + if [ "x${enable_arg_max_hack}" = "xyes" ]; then + echo "${script_name}: enabling ARG_MAX hack." + else + echo "${script_name}: disabling ARG_MAX hack." + fi + # Check if the debug flag was specified. if [ -n "${debug_flag}" ]; then if [ "x${debug_type}" = "xopt" ]; then @@ -3373,29 +3406,24 @@ main() echo "${script_name}: debug symbols disabled." fi - # Check if the verbose make flag was specified. - if [ "x${enable_verbose}" = "xyes" ]; then - echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)" + # Check if the AddressSanitizer flag was specified. + if [ "x${enable_asan}" = "xyes" ]; then + echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)." else - echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)" + enable_asan='no' + echo "${script_name}: AddressSanitizer support disabled." fi - # Check if the ARG_MAX hack was requested. - if [ "x${enable_arg_max_hack}" = "xyes" ]; then - echo "${script_name}: enabling ARG_MAX hack." - else - echo "${script_name}: disabling ARG_MAX hack." - fi - - enable_shared_01=1 # Check if the static lib flag was specified. if [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xyes" ]; then echo "${script_name}: building BLIS as both static and shared libraries." + enable_shared_01=1 + elif [ "x${enable_static}" = "xno" -a "x${enable_shared}" = "xyes" ]; then + echo "${script_name}: building BLIS as a shared library (static library disabled)." + enable_shared_01=1 elif [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xno" ]; then echo "${script_name}: building BLIS as a static library (shared library disabled)." 
enable_shared_01=0 - elif [ "x${enable_static}" = "xno" -a "x${enable_shared}" = "xyes" ]; then - echo "${script_name}: building BLIS as a shared library (static library disabled)." else echo "${script_name}: Both static and shared libraries were disabled." echo "${script_name}: *** Please enable one (or both) to continue." @@ -3917,7 +3945,7 @@ main() # Create a #define for the configuration family (config_name). uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" - + # Create a list of #defines, one for each configuration in config_list. config_list_defines="" for conf in ${config_list}; do @@ -4012,6 +4040,7 @@ main() | sed -e "s/@libpthread@/${libpthread_esc}/g" \ | sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \ | sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \ + | sed -e "s/@enable_asan@/${enable_asan}/g" \ | sed -e "s/@debug_type@/${debug_type}/g" \ | sed -e "s/@enable_system@/${enable_system}/g" \ | sed -e "s/@threading_model@/${threading_model}/g" \ From 63470b49e3b9b15e00a8f666e86ccd70c6005fe9 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 29 Sep 2022 18:52:08 -0500 Subject: [PATCH 091/230] Fix some bugs in bli_pool.c (#670) Details: - Add a check for premature pool exhaustion when checking in blocks via bli_pool_checkin_block(). This detects "double-free" and other bad conditions that don't necessarily result in a segfault. - Make sure to copy all block pointers when growing the pool size. Previously, checked-out block pointers (which are guaranteed to be set to NULL) were not being copied, leading to the presence of uninitialized data. --- frame/base/bli_pool.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 684b0ef73..6449a9774 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -335,6 +335,10 @@ void bli_pool_checkin_block // Query the top_index of the pool. 
const siz_t top_index = bli_pool_top_index( pool ); + // Check for double-free and other conditions which may prematurely + // exhaust the memory pool. + if ( top_index == 0 ) bli_abort(); + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkin_block(): checking in block %d of size %d " "(align %d, offset %d).\n", @@ -407,10 +411,11 @@ void bli_pool_grow const siz_t top_index = bli_pool_top_index( pool ); // Copy the contents of the old block_ptrs array to the new/resized - // array. Notice that we can begin with top_index since all entries - // from 0 to top_index-1 have been (and are currently) checked out - // to threads. - for ( dim_t i = top_index; i < num_blocks_cur; ++i ) + // array. Notice that we copy the entire array, including elements + // corresponding to blocks that have been checked out. Those elements + // were set to NULL upon checkout, and so it's important to copy them + // into the new block_ptrs array. + for ( dim_t i = 0; i < num_blocks_cur; ++i ) { block_ptrs_new[i] = block_ptrs_cur[i]; } From 76a23bd8c33e161221891935a489df9a9fb9c8c0 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 3 Oct 2022 15:55:07 -0500 Subject: [PATCH 092/230] Reinstate sanity check in bli_pool_finalize. (#671) Details: - Added a reinit argument to bli_pool_finalize(). This bool will signal whether or not the function is being called from bli_pool_reinit(). If it is not being called from _reinit(), we can safely check to confirm that .top_index == 0 (i.e., all blocks have been checked in). But if it *is* being called from _reinit(), then that check will be skipped since one of the predicted use cases for bli_pool_reinit() anticipates that some blocks are (probably) checked out when the pool_t is reinitialized. - Updated existing invocations of bli_pool_finalize() to pass in either FALSE (from bli_apool_free_block() or bli_pba_finalize_pools()) or TRUE (from bli_pool_reinit()) for the new reinit argument. 
--- frame/base/bli_apool.c | 2 +- frame/base/bli_pba.c | 6 +++--- frame/base/bli_pool.c | 22 +++++++++------------- frame/base/bli_pool.h | 3 ++- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c index a42c7103e..693e91bf9 100644 --- a/frame/base/bli_apool.c +++ b/frame/base/bli_apool.c @@ -188,7 +188,7 @@ void bli_apool_free_block if ( pool != NULL ) { // Finalize the pool. - bli_pool_finalize( pool ); + bli_pool_finalize( pool, FALSE ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): pool_t %d: ", ( int )i ); diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index 68dffd728..cabaf4ff6 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -389,9 +389,9 @@ void bli_pba_finalize_pools pool_t* pool_c = bli_pba_pool( index_c, pba ); // Finalize the memory pools for A, B, and C. - bli_pool_finalize( pool_a ); - bli_pool_finalize( pool_b ); - bli_pool_finalize( pool_c ); + bli_pool_finalize( pool_a, FALSE ); + bli_pool_finalize( pool_b, FALSE ); + bli_pool_finalize( pool_c, FALSE ); } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 6449a9774..891f770ae 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -115,7 +115,8 @@ void bli_pool_init void bli_pool_finalize ( - pool_t* pool + pool_t* pool, + bool reinit ) { // NOTE: This implementation assumes that either: @@ -129,24 +130,22 @@ void bli_pool_finalize // Query the total number of blocks currently allocated. const siz_t num_blocks = bli_pool_num_blocks( pool ); - // NOTE: This sanity check has been disabled because bli_pool_reinit() - // is currently implemented in terms of bli_pool_finalize() followed by - // bli_pool_init(). If that _reinit() takes place when some blocks are - // checked out, then we would expect top_index != 0, and therefore this - // check is not universally appropriate. 
-#if 0 // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); // Sanity check: The top_index should be zero. - if ( top_index != 0 ) + // NOTE: This sanity check is disabled when called from bli_pool_reinit() + // because it is currently implemented in terms of bli_pool_finalize() followed by + // bli_pool_init(). If that _reinit() takes place when some blocks are + // checked out, then we would expect top_index != 0, and therefore this + // check is not universally appropriate. + if ( top_index != 0 && !reinit ) { printf( "bli_pool_finalize(): final top_index == %d (expected 0); block_size: %d.\n", ( int )top_index, ( int )bli_pool_block_size( pool ) ); printf( "bli_pool_finalize(): Implication: not all blocks were checked back in!\n" ); bli_abort(); } -#endif // Query the free() function pointer for the pool. free_ft free_fp = bli_pool_free_fp( pool ); @@ -215,7 +214,7 @@ void bli_pool_reinit // those blocks back into the pool. (This condition can be detected // since the block size is encoded into each pblk, which is copied // upon checkout.) - bli_pool_finalize( pool ); + bli_pool_finalize( pool, TRUE ); // Reinitialize the pool with the new parameters, in particular, // the new block size. @@ -407,9 +406,6 @@ void bli_pool_grow = bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val ); - // Query the top_index of the pool. - const siz_t top_index = bli_pool_top_index( pool ); - // Copy the contents of the old block_ptrs array to the new/resized // array. Notice that we copy the entire array, including elements // corresponding to blocks that have been checked out. 
Those elements diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h index 0b16ae8ee..6f199f7a4 100644 --- a/frame/base/bli_pool.h +++ b/frame/base/bli_pool.h @@ -228,7 +228,8 @@ void bli_pool_init ); void bli_pool_finalize ( - pool_t* pool + pool_t* pool, + bool reinit ); void bli_pool_reinit ( From 9453e0f163503f64a290256b4be53d8882224863 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 3 Oct 2022 19:46:20 -0500 Subject: [PATCH 093/230] CREDITS file update. Details: - This attribution was intended to go in PR #647. --- CREDITS | 1 + 1 file changed, 1 insertion(+) diff --git a/CREDITS b/CREDITS index 152de0a4b..55c974f1b 100644 --- a/CREDITS +++ b/CREDITS @@ -36,6 +36,7 @@ but many others have contributed code and feedback, including Roman Gareev @gareevroman Richard Goldschmidt @SuperFluffy Chris Goodyer + Alexander Grund @Flamefire John Gunnels @jagunnels (IBM, T.J. Watson Research Center) Ali Emre Gülcü @Lephar Jeff Hammond @jeffhammond (Intel) From 23f5b8df3e802a27bacd92571184ec57bbdfa646 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 17 Oct 2022 20:21:21 -0500 Subject: [PATCH 094/230] Shuffled checked properties in bli_l3_check.c. (#676) Details: - Added certain checks for matrix structure to the level-3 operations' _check() functions, and slightly reorganized existing checks. --- frame/3/bli_l3_check.c | 179 ++++++++++++++++++++++++++++------------- 1 file changed, 122 insertions(+), 57 deletions(-) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 3b4d88746..9ac0a7fbb 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -44,7 +44,7 @@ void bli_gemm_check const cntx_t* cntx ) { - //err_t e_val; + err_t e_val; // Check basic properties of the operation. @@ -52,15 +52,14 @@ void bli_gemm_check // Check object structure. 
- // NOTE: Can't perform these checks as long as bli_gemm_check() is called - // from bli_l3_int(), which is in the execution path for structured - // level-3 operations such as hemm. + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); - //e_val = bli_check_general_object( a ); - //bli_check_error_code( e_val ); + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); - //e_val = bli_check_general_object( b ); - //bli_check_error_code( e_val ); + e_val = bli_check_general_object( c ); + bli_check_error_code( e_val ); } void bli_gemmt_check @@ -83,6 +82,14 @@ void bli_gemmt_check e_val = bli_check_square_object( c ); bli_check_error_code( e_val ); + + // Check object structure. + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); } void bli_hemm_check @@ -102,10 +109,21 @@ void bli_hemm_check bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx ); + // Check matrix squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + // Check object structure. e_val = bli_check_hermitian_object( a ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( c ); + bli_check_error_code( e_val ); } void bli_herk_check @@ -127,18 +145,26 @@ void bli_herk_check bli_herk_basic_check( alpha, a, &ah, beta, c, cntx ); - // Check for real-valued alpha and beta. - - e_val = bli_check_real_valued_object( alpha ); - bli_check_error_code( e_val ); + // Check matrix squareness. - e_val = bli_check_real_valued_object( beta ); + e_val = bli_check_square_object( c ); bli_check_error_code( e_val ); // Check matrix structure. e_val = bli_check_hermitian_object( c ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + // Check for real-valued alpha and beta. 
+ + e_val = bli_check_real_valued_object( alpha ); + bli_check_error_code( e_val ); + + e_val = bli_check_real_valued_object( beta ); + bli_check_error_code( e_val ); } void bli_her2k_check @@ -162,15 +188,26 @@ void bli_her2k_check bli_her2k_basic_check( alpha, a, &bh, b, &ah, beta, c, cntx ); - // Check for real-valued beta. + // Check matrix squareness. - e_val = bli_check_real_valued_object( beta ); + e_val = bli_check_square_object( c ); bli_check_error_code( e_val ); // Check matrix structure. e_val = bli_check_hermitian_object( c ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + // Check for real-valued beta. + + e_val = bli_check_real_valued_object( beta ); + bli_check_error_code( e_val ); } void bli_symm_check @@ -190,10 +227,21 @@ void bli_symm_check bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx ); + // Check matrix squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + // Check object structure. e_val = bli_check_symmetric_object( a ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( c ); + bli_check_error_code( e_val ); } void bli_syrk_check @@ -215,10 +263,18 @@ void bli_syrk_check bli_herk_basic_check( alpha, a, &at, beta, c, cntx ); + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + // Check matrix structure. e_val = bli_check_symmetric_object( c ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); } void bli_syr2k_check @@ -242,10 +298,21 @@ void bli_syr2k_check bli_her2k_basic_check( alpha, a, &bt, b, &at, beta, c, cntx ); + // Check matrix squareness. + + e_val = bli_check_square_object( c ); + bli_check_error_code( e_val ); + // Check matrix structure. 
e_val = bli_check_symmetric_object( c ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); } void bli_trmm3_check @@ -261,14 +328,25 @@ void bli_trmm3_check { err_t e_val; - // Perform checks common to hemm/symm/trmm/trsm. + // Check basic properties of the operation. bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx ); + // Check matrix squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + // Check object structure. e_val = bli_check_triangular_object( a ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); + + e_val = bli_check_general_object( c ); + bli_check_error_code( e_val ); } void bli_trmm_check @@ -282,14 +360,22 @@ void bli_trmm_check { err_t e_val; - // Perform checks common to hemm/symm/trmm/trsm. + // Check basic properties of the operation. bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx ); + // Check matrix squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + // Check object structure. e_val = bli_check_triangular_object( a ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); } void bli_trsm_check @@ -307,10 +393,18 @@ void bli_trsm_check bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx ); + // Check matrix squareness. + + e_val = bli_check_square_object( a ); + bli_check_error_code( e_val ); + // Check object structure. e_val = bli_check_triangular_object( a ); bli_check_error_code( e_val ); + + e_val = bli_check_general_object( b ); + bli_check_error_code( e_val ); } // ----------------------------------------------------------------------------- @@ -385,6 +479,14 @@ void bli_gemmt_basic_check e_val = bli_check_level3_dims( a, b, c ); bli_check_error_code( e_val ); + + // Check for consistent datatypes. 
+ + e_val = bli_check_consistent_object_datatypes( c, a ); + bli_check_error_code( e_val ); + + e_val = bli_check_consistent_object_datatypes( c, b ); + bli_check_error_code( e_val ); } void bli_hemm_basic_check @@ -417,11 +519,6 @@ void bli_hemm_basic_check bli_check_error_code( e_val ); } - // Check matrix squareness. - - e_val = bli_check_square_object( a ); - bli_check_error_code( e_val ); - // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( c, a ); @@ -452,19 +549,6 @@ void bli_herk_basic_check e_val = bli_check_level3_dims( a, ah, c ); bli_check_error_code( e_val ); - // Check matrix squareness. - - e_val = bli_check_square_object( c ); - bli_check_error_code( e_val ); - - // Check matrix structure. - - e_val = bli_check_general_object( a ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( ah ); - bli_check_error_code( e_val ); - // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( c, a ); @@ -501,25 +585,6 @@ void bli_her2k_basic_check e_val = bli_check_level3_dims( b, ah, c ); bli_check_error_code( e_val ); - // Check matrix squareness. - - e_val = bli_check_square_object( c ); - bli_check_error_code( e_val ); - - // Check matrix structure. - - e_val = bli_check_general_object( a ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( bh ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( b ); - bli_check_error_code( e_val ); - - e_val = bli_check_general_object( ah ); - bli_check_error_code( e_val ); - // Check for consistent datatypes. 
e_val = bli_check_consistent_object_datatypes( c, a ); @@ -586,13 +651,13 @@ void bli_l3_basic_check e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); - e_val = bli_check_object_buffer( a ); + e_val = bli_check_object_buffer( beta ); bli_check_error_code( e_val ); - e_val = bli_check_object_buffer( b ); + e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); - e_val = bli_check_object_buffer( beta ); + e_val = bli_check_object_buffer( b ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( c ); From 88105dbecf0f9dfbfa30215743346e8bd6afb971 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 21 Oct 2022 15:16:12 -0500 Subject: [PATCH 095/230] Added Discord documentation (#677) Details: - Added a docs/Discord.md markdown document that walks the reader through creating a Discord account, obtaining the invite link, and using the link to join the BLIS Discord server. - Updated README.md to reference the new Discord.md document in multiple places, including via the official Discord logo (used with explicit permission from representatives at Discord Inc.). --- README.md | 31 ++++++++--- docs/Discord.md | 115 ++++++++++++++++++++++++++++++++++++++++ docs/images/discord.svg | 23 ++++++++ 3 files changed, 163 insertions(+), 6 deletions(-) create mode 100644 docs/Discord.md create mode 100644 docs/images/discord.svg diff --git a/README.md b/README.md index 7996cb676..012861366 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ [![Build Status](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis) [![Build Status](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master) +[Discord logo](docs/Discord.md) + Contents -------- @@ -97,6 +99,17 @@ all of which are available for free via the [edX platform](http://www.edx.org/). 
What's New ---------- + * **Join us on Discord!** In 2021, we soft-launched our [Discord](https://discord.com/) +server by privately inviting current and former collaborators, attendees of our BLIS +Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled by +the results thus far, and are happy to announce that our new community is now open to +the broader public! If you'd like to hang out with other BLIS users and developers, +ask a question, discuss future features, or just say hello, please feel free to join us! +We've put together a [step-by-step guide](docs/Discord.md) for creating an account and +joining our cozy enclave. We even have a monthly "BLIS happy hour" event where people +can casually come together for a video chat, Q&A, brainstorm session, or whatever it +happens to unfold into! + * **Addons feature now available!** Have you ever wanted to quickly extend BLIS's operation support or define new custom BLIS APIs for your application, but were unsure of how to add your source code to BLIS? Do you want to isolate your custom @@ -417,6 +430,9 @@ If/when you have time, we *strongly* encourage you to read the detailed walkthrough of the build system found in our [Build System](docs/BuildSystem.md) guide. +If you are still having trouble, you are welcome to [join us on Discord](docs/Discord.md) +for further information and/or assistance. + Example Code ------------ @@ -500,6 +516,10 @@ empirically measured performance of `gemm` on select hardware architectures within BLIS and other BLAS libraries when performing matrix problems where one or two dimensions is exceedingly small. + * **[Discord](docs/Discord.md).** This document describes how to: create an +account on Discord (if you don't already have one); obtain a private invite +link; and use that invite link to join our BLIS server on Discord. 
+ * **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of changes included with each new version of BLIS, along with contributor credits for key features. @@ -610,16 +630,15 @@ has Linux, OSX and Windows binary packages for x86_64. Discussion ---------- -You can keep in touch with developers and other users of the project by joining -one of the following mailing lists: +Most of the active discussions are now happening on our [Discord](https://discord.com/) +server. Users and developers alike are welcome! Please see the +[BLIS Discord guide](docs/Discord.md) for a walkthrough of how to join us. + +You can also still stay in touch by using either of the following mailing lists: * [blis-devel](https://groups.google.com/group/blis-devel): Please join and post to this mailing list if you are a BLIS developer, or if you are trying to use BLIS beyond simply linking to it as a BLAS library. -**Note:** Most of the interesting discussions happen here; don't be afraid to -join! If you would like to submit a bug report, or discuss a possible bug, -please consider opening a [new issue](https://github.com/flame/blis/issues) on -github. 
* [blis-discuss](https://groups.google.com/group/blis-discuss): Please join and post to this mailing list if you have general questions or feedback regarding diff --git a/docs/Discord.md b/docs/Discord.md new file mode 100644 index 000000000..b4403f7bc --- /dev/null +++ b/docs/Discord.md @@ -0,0 +1,115 @@ +*NOTE: The [BLIS](https://github.com/flame/blis) project is not affiliated with [Discord Inc.](https://discord.com/company) in any way, and we use the Discord logo with their permission.* + + +## Contents + +* **[Welcome](Discord.md#welcome)** +* **[Introduction to Discord](Discord.md#introduction-to-discord)** +* **[Creating an account](Discord.md#creating-an-account)** +* **[Obtaining the invite link](Discord.md#obtaining-the-invite-link)** +* **[Joining the BLIS server](Discord.md#joining-the-blis-server)** +* **[Additional resources](Discord.md#additional-resources)** + +## Welcome + +In 2021, we soft-launched our Discord server by privately inviting current and former collaborators, attendees of our BLIS Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled by the results thus far, and are happy to announce that our new community is now open to the broader public! + +If you'd like to hang out with other BLIS users and developers, ask a question, discuss future features, or just say hello, please feel free to join us! Joining our server is also a great way to get announcements for new versions, workshop events, video chat parties, and other infrequent updates. + +**If you already use Discord** and want to skip straight to the invite link, you can find it [here](#obtaining-the-invite-link). Just be sure to manually remove the dashes (`-`) and equal signs (`=`) before using it! + +## Introduction to Discord + +The remaining sections of this file walk the reader through basic instructions for joining the BLIS community on [Discord](https://discord.com). + +Discord is free to use for everyone. 
You can optionally pay for premium features via their [Nitro](https://discord.com/nitro) subscription, but Nitro is not necessary for most casual users. + +Discord offers several kinds of clients. Users may use Discord via: + +- the official Android and iOS apps on mobile devices +- a [web browser](https://discord.com/login) +- the standalone desktop application, available from their [Download](https://discord.com/download) page. + +You can even stay logged in on multiple devices! Each one will automatically sync itself to newly sent/received messages. + +In this document, we'll walk you through each step necessary to join the BLIS Discord community. First, we'll talk about how to [create a Discord account](#creating-an-account) (if you don't already have one). Then, we'll explain how to [obtain the invite link](#obtaining-the-invite-link). And finally, we'll tell you how to use that invite link to [join the BLIS Discord server](#joining-the-blis-server). + + +## Creating an account + +If you don't already have a Discord account, you'll need to first create one. + +As of this writing, you may follow these steps to create your account: + +*NOTE: We recommend executing these steps using a desktop web browser. Once you've created your account and joined the BLIS server, you can proceed to use your client(s) of choice (mobile app, desktop app, or web browser).* + +1. Go to [https://discord.com](https://discord.com) and click on "Login" at the top-right. +2. At the bottom of the dialog, click the "Register" link. +3. Enter the prompted information, such as username and email, then click "Continue". +4. Perform the Captcha verification. +5. This should take you into the web browser version of Discord. You will be asked if you want to create your own server. Close the dialog without making any selection. +6. At this point, you need to verify your email address. Check your email account for a message from Discord. Click the link in the email. 
This should bring up a dialog confirming your email has been verified. You may now close the web browser tab. + +Congratulations! You're now a member of Discord and ready to join individual communities, or "servers." + + +## Obtaining the invite link + +Since we do not have access to an official Captcha-like service to confirm that you are not a software bot, we have instead obfuscated our invite link in a way that should be easy for a human to unmangle. + +Here's an example invite link (for reference purposes only): `https://discord.gg/abC2jUVeip` + +Notice that the link consists of `https://discord.gg/` followed by a 10-character string consisting of lower- and upper-case letters, and (typically) one numerical digit. + +**The BLIS Discord invite link is: https://discord.gg/e-Zx=p-z9=p-Ks=x** + +*Note that you **must** remove the dashes (`-`) and equal signs (`=`) before using the link!* + +Once you decipher the invite link, copy it to your clipboard so it's ready to use in the appropriate step within the next section, [Joining the BLIS server](#joining-the-blis-server). + + +## Joining the BLIS server + +Once you have the invite link copied to your clipboard, follow these steps in order to join the BLIS server: + +*NOTE: We recommend executing these steps using a desktop web browser. Once you've joined the BLIS server, you can proceed to use your client(s) of choice (mobile app, desktop app, or web browser).* + +1. Log in to the [Discord website](https://discord.com). +2. Once logged in, on the left-hand side of the UI, click on the button with the "+" symbol. This will bring up a dialog asking if you want to create a server. +3. At the bottom of the dialog, there will be a section asking, "Have an invite already?" Click the button below it labeled "Join a Server". +4. Paste the invite link into the prompt and click "Join Server". +5. This should bring up a dialog stating that you've been invited to join the BLIS server. Click on "Accept Invite". 
This will trigger a new dialog informing you that your account has been updated with the invitation. + +That's it! Now that you've joined our server, please consider introducing yourself in `#general`! We love hearing about how application developers and end-users are using BLIS. + +If you had any difficulty joining or with the invite link, please reach out to [field@cs.utexas.edu](mailto:field@cs.utexas.edu). + + +## Additional resources + +Are you new to Discord? Not sure how to work this newfangled technology? Don't worry; once you learn the basics, you'll feel much more at home! + +While a tutorial on Discord is beyond the scope of this document, there are countless articles and YouTube videos that introduce newcomers to Discord's UI. Here are a few articles on the basics: + +- **tom's guide**. [Discord: Everything You Need to Know](https://www.tomsguide.com/us/what-is-discord,review-5203.html) +- **WIRED.** [How to Use Discord: A Beginner's Guide](https://www.wired.com/story/how-to-use-discord/) +- **Discord Support.** [Beginner's Guide to Discord](https://support.discord.com/hc/en-us/articles/360045138571-Beginner-s-Guide-to-Discord) + +And some YouTube videos: + +- **Tech Audit TV.** [How to Use Discord in 2022: The Ultimate Beginner Walkthrough](https://www.youtube.com/watch?v=nPmdafMo1b8) +- **Howfinity.** [How to Use Discord - Beginner's Guide](https://www.youtube.com/watch?v=rnYGrq95ezA) + +Some things I recommend setting up shortly after you create your account: + +- Take note of your username's "tag" or disambiguator. This is a randomly-assigned four-digit number that gets implicitly appended to the end of your username (e.g. `bsmith#1234`), which helps when others need to tell you apart from others who have the same username. +- Not happy with your username? You can change it! +- Review your privacy settings, and consider using two-factor authentication. +- Personalize your account with a custom profile image. 
+- Consider switching to the "dark" theme (if you prefer dark modes on other websites or on mobile devices). +- Tweak other appearance settings such as the font size or UI compactness. +- Set up your notifications. + +There are many other settings in Discord! Feel free to explore all of them by clicking the gear icon in the bottom-left area of your screen, just to the right of your username. + +We hope you found this short guide useful, and we hope to see you on Discord! Thanks for your interest in BLIS and our community! :) diff --git a/docs/images/discord.svg b/docs/images/discord.svg new file mode 100644 index 000000000..1f483fe8f --- /dev/null +++ b/docs/images/discord.svg @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + + + From 2dd692b710b6a9889f7ebdd7934a2108be5c5530 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 26 Oct 2022 18:10:26 -0500 Subject: [PATCH 096/230] Fix auto-detection of firestorm (Apple M1). --- frame/base/bli_cpuid.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index 527db1f5d..d967cc05d 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -781,7 +781,7 @@ uint32_t bli_cpuid_query if ( bli_cpuid_has_features( ecx, FEATURE_MASK_AVX ) ) *features |= FEATURE_AVX; if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA3 ) ) *features |= FEATURE_FMA3; - // Check whether the hardware supports xsave/xrestor/xsetbv/xgetbv AND + // Check whether the hardware supports xsave/xrestor/xsetbv/xgetbv AND // support for these is enabled by the OS. If so, then we proceed with // checking that various register-state saving features are available. if ( bli_cpuid_has_features( ecx, FEATURE_MASK_XGETBV ) ) @@ -813,7 +813,7 @@ uint32_t bli_cpuid_query // The OS can manage the state of 512-bit zmm (AVX-512) registers // only if the xcr[7:5] bits are set. If they are not set, then - // clear all feature bits related to AVX-512. 
+ // clear all feature bits related to AVX-512. if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM | XGETBV_MASK_YMM | XGETBV_MASK_ZMM ) ) @@ -829,7 +829,7 @@ uint32_t bli_cpuid_query // The OS can manage the state of 256-bit ymm (AVX) registers // only if the xcr[2] bit is set. If it is not set, then - // clear all feature bits related to AVX. + // clear all feature bits related to AVX. if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM | XGETBV_MASK_YMM ) ) { @@ -842,7 +842,7 @@ uint32_t bli_cpuid_query // The OS can manage the state of 128-bit xmm (SSE) registers // only if the xcr[1] bit is set. If it is not set, then // clear all feature bits related to SSE (which means the - // entire bitfield is clear). + // entire bitfield is clear). if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM ) ) { *features = 0; @@ -1025,6 +1025,7 @@ static uint32_t get_coretype { int implementer = 0x00, part = 0x000; *features = FEATURE_NEON; + bool has_sve = FALSE; #ifdef __linux__ if ( getauxval( AT_HWCAP ) & HWCAP_CPUID ) @@ -1033,7 +1034,7 @@ static uint32_t get_coretype // /sys/devices/system/cpu/cpu0/regs/identification/midr_el1 // and split out in /proc/cpuinfo (with a tab before the colon): // CPU part : 0x0a1 - + uint64_t midr_el1; __asm("mrs %0, MIDR_EL1" : "=r" (midr_el1)); /* @@ -1047,8 +1048,8 @@ static uint32_t get_coretype implementer = (midr_el1 >> 24) & 0xFF; part = (midr_el1 >> 4) & 0xFFF; } - - bool has_sve = getauxval( AT_HWCAP ) & HWCAP_SVE; + + has_sve = getauxval( AT_HWCAP ) & HWCAP_SVE; if (has_sve) *features |= FEATURE_SVE; #endif //__linux__ @@ -1097,7 +1098,7 @@ static uint32_t get_coretype // CAVIUM_CPU_PART_THUNDERX2 0x0AF // CAVIUM_CPU_PART_THUNDERX3 0x0B8 // taken from OpenBLAS // - // BRCM_CPU_PART_BRAHMA_B53 0x100 + // BRCM_CPU_PART_BRAHMA_B53 0x100 // BRCM_CPU_PART_VULCAN 0x516 // // QCOM_CPU_PART_FALKOR_V1 0x800 @@ -1210,7 +1211,7 @@ uint32_t bli_cpuid_query #elif defined(__arm__) || defined(_M_ARM) || defined(_ARCH_PPC) -/* +/* I can't easily 
find documentation to do this as for aarch64, though it presumably could be unearthed from Linux code. However, on Linux 5.2 (and Androids's 3.4), /proc/cpuinfo has this sort of From c803b03e52a7a6997a8d304a8cfa9acf7c1c555b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 26 Oct 2022 18:20:00 -0500 Subject: [PATCH 097/230] Add check to disable armsve on Apple M1. --- configure | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/configure b/configure index a53f25380..37399fbde 100755 --- a/configure +++ b/configure @@ -1335,6 +1335,17 @@ blacklistbu_add() fi } +blacklistos_add() +{ + # Check whether we've already blacklisted the given sub-config so + # we don't output redundant messages. + if [ $(is_in_list "$1" "${config_blist}") == "false" ]; then + + echowarn "The operating system does not support building '$1'; adding to blacklist." + config_blist="${config_blist} $1" + fi +} + blacklist_init() { config_blist="" @@ -1989,6 +2000,13 @@ check_assembler() fi } +check_os() +{ + if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then + blacklistos_add "armsve" + fi +} + try_assemble() { local cc cflags asm_src asm_base asm_bin rval @@ -2886,6 +2904,9 @@ main() get_binutils_version check_assembler + # Check if there is any incompatibility due to the operating system. + check_os + # Remove duplicates and whitespace from the blacklist. blacklist_cleanup From aeb5f0cc19665456e990a7ffccdb09da2e3f504b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 27 Oct 2022 12:39:11 -0500 Subject: [PATCH 098/230] Omnibus PR - Oct 2023 (#678) Details: - This is an "omnibus" commit, consisting of multiple medium-sized commits that affect non-trivial aspects of BLIS. The major highlights: - Relocated the pba, sba pool (from the rntm_t), and mem_t (from the cntl_t) to the thrinfo_t object. This allows the rntm_t to be effectively const (although it is sometimes copied internally and modified to reflect different ways of parallelism). 
Moving the mem_t sets the stage for sharing a global control tree amongst all threads. - De-templatized the macrokernels for gemmt, trmm, and trsm to match the macrokernel for gemm, which has been de-templatized since 54fa28b. - Reimplemented bli_l3_determine_kc() by separating out the logic for adjusting KC based on MR/NR for triangular A and/or B into a new function, bli_l3_adjust_kc(). For now, this function is still called from bli_l3_determine_kc(), but in the future we plan to have it called once when constructing the control tree. - Refactored the level-3 thread decorator into two parts: - One part deals only with launching threads, each one calling a generic thread entry function. This code resides in frame/thread and constitutes the definition of bli_thread_launch(). Note that it is specific to the threading implementation (OpenMP, pthreads, single, etc.) - The other part deals with passing the matrix operands and related information into bli_thread_launch(). This is the "l3 decorator" and now resides in frame/3. It is agnostic to the threading implementation. - Modified the "level" of the thread control tree passed in at each operation. Previously, each operation (e.g. bli_gemm_blk_var1()) was passed in a communicator representing the active thread teams which would share the available work. Now, the *parent* thread comm is passed in. The operation then grabs the child comm and uses it to partition the work. The difference is in bli_trsm_blk_var1(), where there are now two children nodes for this single operation (i.e. the thread control tree is split one level above where the control tree is). The sub-prenode is used for the trsm subproblem while the normal sub-node is used for the gemm part. Importantly, the parent comm is used for the barrier between them. - Removed cntl_t* arguments from bli_*_front() functions. 
These will be added back in the future when the control tree's creation is moved so that it happens much sooner (provided that bli_*_front() have not been absorbed into their respective bli_*_ex() functions). - Renamed various bli_thread_*() query functions to bli_thrinfo_*(), for consistency. This includes _num_threads(), _thread_id(), _n_way(), _work_id(), _sba_pool(), _pba(), _mem(), _barrier(), _broadcast(), and _am_chief(). - Removed extraneous barrier from _blk_var3() of gemm and trsm. - Fixed a typo in bli_type_defs.h where BLIS_BLAS_INT_TYPE_SIZE was misspelled. --- addon/gemmd/attic/bao_gemmd_bp_var2.c | 10 +- addon/gemmd/bao_gemmd_bp_var1.c | 10 +- addon/gemmd/bao_l3_packm_a.c | 10 +- addon/gemmd/bao_l3_packm_b.c | 10 +- addon/gemmd/bao_l3_packm_var1.c | 4 +- addon/gemmd/bao_l3_packm_var2.c | 4 +- build/libblis-symbols.def | 1484 ++--------------- frame/1m/bli_l1m_oft_var.h | 5 +- frame/1m/bli_l1m_tapi.c | 24 +- frame/1m/bli_l1m_unb_var1.c | 15 +- frame/1m/bli_l1m_unb_var1.h | 15 +- frame/1m/packm/bli_packm.h | 1 + frame/1m/packm/bli_packm_alloc.c | 50 +- frame/1m/packm/bli_packm_alloc.h | 13 +- frame/1m/packm/bli_packm_blk_var1.c | 11 +- frame/1m/packm/bli_packm_blk_var1.h | 11 +- frame/1m/packm/bli_packm_cntl.c | 6 +- frame/1m/packm/bli_packm_cntl.h | 2 +- frame/1m/packm/bli_packm_init.c | 7 +- frame/1m/packm/bli_packm_init.h | 5 +- frame/1m/packm/bli_packm_int.c | 11 +- frame/1m/packm/bli_packm_int.h | 5 +- frame/1m/packm/bli_packm_struc_cxk_md.c | 2 - frame/1m/packm/bli_packm_thrinfo.c | 75 - frame/1m/packm/bli_packm_thrinfo.h | 40 - frame/1m/unpackm/bli_unpackm_cntl.c | 10 +- frame/1m/unpackm/bli_unpackm_cntl.h | 8 +- frame/1m/unpackm/bli_unpackm_int.c | 4 +- frame/3/bli_l3.h | 4 + frame/3/bli_l3_blocksize.c | 297 +--- frame/3/bli_l3_blocksize.h | 57 +- frame/3/bli_l3_cntl.c | 25 +- frame/3/bli_l3_cntl.h | 11 +- frame/3/bli_l3_decor.c | 298 ++++ frame/{thread => 3}/bli_l3_decor.h | 33 +- frame/3/bli_l3_int.c | 15 +- frame/3/bli_l3_int.h | 3 +- 
frame/3/bli_l3_oapi_ex.c | 64 +- frame/3/bli_l3_oapi_ex.h | 8 +- frame/3/bli_l3_oft.h | 8 +- frame/3/bli_l3_oft_var.h | 3 +- frame/3/bli_l3_packab.c | 10 +- frame/3/bli_l3_packab.h | 6 +- frame/3/bli_l3_sup.c | 16 +- frame/3/bli_l3_sup.h | 4 +- .../bli_l3_decor.c => 3/bli_l3_sup_decor.c} | 161 +- frame/{thread => 3}/bli_l3_sup_decor.h | 29 +- frame/3/bli_l3_sup_int.c | 40 +- frame/3/bli_l3_sup_int.h | 4 +- frame/3/bli_l3_sup_oft.h | 2 +- frame/3/bli_l3_sup_packm.c | 68 +- frame/3/bli_l3_sup_packm.h | 8 +- frame/3/bli_l3_sup_packm_var.c | 32 +- frame/3/bli_l3_sup_var1n2m.c | 145 +- frame/3/bli_l3_sup_vars.h | 2 +- frame/3/bli_l3_tapi_ex.c | 18 +- frame/3/bli_l3_tapi_ex.h | 16 +- frame/3/bli_l3_thrinfo.c | 389 +++-- frame/3/bli_l3_thrinfo.h | 63 +- frame/3/gemm/bli_gemm_blk_var1.c | 17 +- frame/3/gemm/bli_gemm_blk_var2.c | 17 +- frame/3/gemm/bli_gemm_blk_var3.c | 20 +- frame/3/gemm/bli_gemm_cntl.c | 158 +- frame/3/gemm/bli_gemm_cntl.h | 18 +- frame/3/gemm/bli_gemm_front.c | 6 +- frame/3/gemm/bli_gemm_front.h | 5 +- frame/3/gemm/bli_gemm_ker_var2.c | 14 +- frame/3/gemm/bli_gemm_var.h | 11 +- frame/3/gemm/{ => other}/bli_gemm_ker_var1.c | 0 frame/3/gemm/other/bli_gemm_ker_var2.c | 8 +- frame/3/gemm/other/bli_gemm_ker_var2rr.c | 8 +- frame/3/gemm/other/bli_gemm_ker_var2sl.c | 8 +- frame/3/gemmt/bli_gemmt_front.c | 6 +- frame/3/gemmt/bli_gemmt_front.h | 3 +- frame/3/gemmt/bli_gemmt_l_ker_var2.c | 757 ++++----- frame/3/gemmt/bli_gemmt_u_ker_var2.c | 763 ++++----- frame/3/gemmt/bli_gemmt_var.h | 4 +- frame/3/gemmt/bli_gemmt_x_ker_var2.c | 9 +- frame/3/gemmt/other/bli_gemmt_l_ker_var2.c | 8 +- frame/3/gemmt/other/bli_gemmt_u_ker_var2.c | 8 +- frame/3/hemm/bli_hemm_front.c | 6 +- frame/3/hemm/bli_hemm_front.h | 3 +- frame/3/symm/bli_symm_front.c | 6 +- frame/3/symm/bli_symm_front.h | 3 +- frame/3/trmm/bli_trmm_front.c | 6 +- frame/3/trmm/bli_trmm_front.h | 3 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 551 +++--- frame/3/trmm/bli_trmm_lu_ker_var2.c | 566 +++---- 
frame/3/trmm/bli_trmm_rl_ker_var2.c | 676 +++----- frame/3/trmm/bli_trmm_ru_ker_var2.c | 714 ++++---- frame/3/trmm/bli_trmm_var.h | 4 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 11 +- frame/3/trmm/other/bli_trmm_ll_ker_var2.c | 4 +- frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c | 8 +- frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c | 8 +- frame/3/trmm/other/bli_trmm_lu_ker_var2.c | 4 +- frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c | 8 +- frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c | 8 +- frame/3/trmm/other/bli_trmm_rl_ker_var2.c | 4 +- frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c | 8 +- frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c | 8 +- frame/3/trmm/other/bli_trmm_ru_ker_var2.c | 4 +- frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c | 8 +- frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c | 8 +- frame/3/trmm3/bli_trmm3_front.c | 6 +- frame/3/trmm3/bli_trmm3_front.h | 3 +- frame/3/trsm/bli_trsm_blk_var1.c | 28 +- frame/3/trsm/bli_trsm_blk_var2.c | 17 +- frame/3/trsm/bli_trsm_blk_var3.c | 25 +- frame/3/trsm/bli_trsm_cntl.c | 101 +- frame/3/trsm/bli_trsm_cntl.h | 13 +- frame/3/trsm/bli_trsm_front.c | 6 +- frame/3/trsm/bli_trsm_front.h | 3 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 625 +++---- frame/3/trsm/bli_trsm_lu_ker_var2.c | 658 +++----- frame/3/trsm/bli_trsm_rl_ker_var2.c | 576 +++---- frame/3/trsm/bli_trsm_ru_ker_var2.c | 641 +++---- frame/3/trsm/bli_trsm_var.h | 44 +- frame/3/trsm/bli_trsm_xx_ker_var2.c | 11 +- frame/3/trsm/other/bli_trsm_ll_ker_var2.c | 4 +- frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c | 4 +- frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c | 4 +- frame/3/trsm/other/bli_trsm_lu_ker_var2.c | 4 +- frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c | 4 +- frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c | 4 +- frame/3/trsm/other/bli_trsm_rl_ker_var2.c | 4 +- frame/3/trsm/other/bli_trsm_ru_ker_var2.c | 4 +- frame/base/bli_cntl.c | 140 +- frame/base/bli_cntl.h | 43 +- frame/base/bli_cntx.h | 2 - frame/base/bli_mem.h | 15 +- frame/base/bli_pba.c | 29 +- frame/base/bli_pba.h | 16 +- 
frame/base/bli_rntm.h | 44 +- frame/base/bli_sba.c | 144 +- frame/base/bli_sba.h | 13 +- frame/compat/extra/bla_gemm3m.c | 3 +- frame/include/bli_extern_defs.h | 2 - frame/include/bli_oapi_ex.h | 2 +- frame/include/bli_tapi_ex.h | 2 +- frame/include/bli_type_defs.h | 28 +- frame/include/level0/1e/bli_copy1es.h | 28 +- frame/include/level0/1e/bli_copyj1es.h | 28 +- frame/include/level0/1e/bli_scal21es.h | 86 +- frame/include/level0/1e/bli_scal2j1es.h | 86 +- frame/include/level0/ri/bli_copyris.h | 18 +- frame/include/level0/ri/bli_scal2jris.h | 4 + frame/include/level0/ri/bli_scal2ris.h | 4 + frame/thread/bli_l3_decor_openmp.c | 253 --- frame/thread/bli_l3_decor_pthreads.c | 264 --- frame/thread/bli_l3_decor_pthreads.h | 61 - frame/thread/bli_l3_decor_single.c | 165 -- frame/thread/bli_l3_sup_decor_openmp.c | 136 -- frame/thread/bli_l3_sup_decor_pthreads.c | 225 --- frame/thread/bli_l3_sup_decor_pthreads.h | 60 - frame/thread/bli_l3_sup_decor_single.c | 138 -- frame/thread/bli_thrcomm.c | 76 +- frame/thread/bli_thrcomm.h | 15 +- frame/thread/bli_thread.c | 55 +- frame/thread/bli_thread.h | 37 +- ..._l3_decor_openmp.h => bli_thread_openmp.c} | 46 +- ...sup_decor_openmp.h => bli_thread_openmp.h} | 18 +- frame/thread/bli_thread_pthreads.c | 128 ++ ...3_decor_single.h => bli_thread_pthreads.h} | 24 +- .../thread/bli_thread_single.c | 29 +- ...sup_decor_single.h => bli_thread_single.h} | 18 +- frame/thread/bli_thrinfo.c | 629 ++----- frame/thread/bli_thrinfo.h | 176 +- frame/thread/bli_thrinfo_sup.c | 290 ---- frame/thread/bli_thrinfo_sup.h | 66 - frame/util/bli_util_tapi.c | 14 +- sandbox/gemmlike/attic/bls_gemm_bp_var2.c | 10 +- sandbox/gemmlike/bli_gemm_ex.c | 4 +- sandbox/gemmlike/bls_gemm.c | 14 +- sandbox/gemmlike/bls_gemm.h | 2 +- sandbox/gemmlike/bls_gemm_bp_var1.c | 102 +- .../gemmlike/bls_l3_decor.c | 156 +- sandbox/gemmlike/{thread => }/bls_l3_decor.h | 25 - sandbox/gemmlike/bls_l3_packm_a.c | 67 +- sandbox/gemmlike/bls_l3_packm_a.h | 21 - 
sandbox/gemmlike/bls_l3_packm_b.c | 67 +- sandbox/gemmlike/bls_l3_packm_b.h | 21 - sandbox/gemmlike/bls_l3_packm_var.h | 2 +- sandbox/gemmlike/bls_l3_packm_var1.c | 11 +- sandbox/gemmlike/bls_l3_packm_var2.c | 11 +- sandbox/gemmlike/bls_l3_packm_var3.c | 11 +- sandbox/gemmlike/thread/bls_l3_decor.c | 148 -- sandbox/gemmlike/thread/bls_l3_decor_openmp.c | 131 -- sandbox/gemmlike/thread/bls_l3_decor_openmp.h | 57 - .../gemmlike/thread/bls_l3_decor_pthreads.c | 222 --- .../gemmlike/thread/bls_l3_decor_pthreads.h | 60 - sandbox/gemmlike/thread/bls_l3_decor_single.c | 137 -- sandbox/old/ref99/old/packm/blx_l3_packm.c | 10 +- .../old/ref99/old/vars/blx_gemm_blk_var3.c | 4 +- .../old/ref99/old/vars/blx_gemm_ker_var2.c | 8 +- .../old/vars/other/blx_gemm_ker_var2rr.c | 8 +- .../old/vars/other/blx_gemm_ker_var2sl.c | 8 +- sandbox/power10/bli_gemm_ex.c | 4 +- test/syrk_diagonal/syrk_diagonal_example2.c | 4 +- test/syrk_diagonal/syrk_diagonal_example2.cxx | 4 +- test/tensor_contraction/tcontract_example.cxx | 16 +- testsuite/src/test_gemm_ukr.c | 18 +- testsuite/src/test_gemmtrsm_ukr.c | 28 +- testsuite/src/test_libblis.c | 20 +- testsuite/src/test_libblis.h | 2 +- testsuite/src/test_trsm_ukr.c | 18 +- 206 files changed, 5013 insertions(+), 11035 deletions(-) delete mode 100644 frame/1m/packm/bli_packm_thrinfo.c create mode 100644 frame/3/bli_l3_decor.c rename frame/{thread => 3}/bli_l3_decor.h (78%) rename frame/{thread/bli_l3_decor.c => 3/bli_l3_sup_decor.c} (59%) rename frame/{thread => 3}/bli_l3_sup_decor.h (77%) rename frame/3/gemm/{ => other}/bli_gemm_ker_var1.c (100%) delete mode 100644 frame/thread/bli_l3_decor_openmp.c delete mode 100644 frame/thread/bli_l3_decor_pthreads.c delete mode 100644 frame/thread/bli_l3_decor_pthreads.h delete mode 100644 frame/thread/bli_l3_decor_single.c delete mode 100644 frame/thread/bli_l3_sup_decor_openmp.c delete mode 100644 frame/thread/bli_l3_sup_decor_pthreads.c delete mode 100644 frame/thread/bli_l3_sup_decor_pthreads.h delete mode 
100644 frame/thread/bli_l3_sup_decor_single.c rename frame/thread/{bli_l3_decor_openmp.h => bli_thread_openmp.c} (69%) rename frame/thread/{bli_l3_sup_decor_openmp.h => bli_thread_openmp.h} (82%) create mode 100644 frame/thread/bli_thread_pthreads.c rename frame/thread/{bli_l3_decor_single.h => bli_thread_pthreads.h} (82%) rename sandbox/gemmlike/thread/bls_l3_decor_single.h => frame/thread/bli_thread_single.c (77%) rename frame/thread/{bli_l3_sup_decor_single.h => bli_thread_single.h} (81%) delete mode 100644 frame/thread/bli_thrinfo_sup.c delete mode 100644 frame/thread/bli_thrinfo_sup.h rename frame/thread/bli_l3_sup_decor.c => sandbox/gemmlike/bls_l3_decor.c (50%) rename sandbox/gemmlike/{thread => }/bls_l3_decor.h (79%) delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor.c delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_openmp.c delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_openmp.h delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_pthreads.c delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_pthreads.h delete mode 100644 sandbox/gemmlike/thread/bls_l3_decor_single.c diff --git a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c index 9139e89b1..dbccedc35 100644 --- a/addon/gemmd/attic/bao_gemmd_bp_var2.c +++ b/addon/gemmd/attic/bao_gemmd_bp_var2.c @@ -386,8 +386,8 @@ void PASTECH2(bao_,ch,varname) \ /* Query the number of threads and thread ids for the JR loop. NOTE: These values are only needed when computing the next micropanel of B. */ \ - const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ - const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ + const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ @@ -416,8 +416,8 @@ void PASTECH2(bao_,ch,varname) \ /* Query the number of threads and thread ids for the IR loop. 
NOTE: These values are only needed when computing the next micropanel of A. */ \ - const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ - const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ + const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \ \ /* Compute number of primary and leftover components of the IR loop. */ \ dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ @@ -476,7 +476,7 @@ void PASTECH2(bao_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. */ \ - bli_thread_barrier( thread_pb ); \ + bli_thrinfo_barrier( thread_pb ); \ } \ } \ \ diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index e3f47982c..b475218e9 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -370,8 +370,8 @@ void PASTECH2(bao_,ch,varname) \ /* Query the number of threads and thread ids for the JR loop. NOTE: These values are only needed when computing the next micropanel of B. */ \ - const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ - const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ + const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ @@ -400,8 +400,8 @@ void PASTECH2(bao_,ch,varname) \ /* Query the number of threads and thread ids for the IR loop. NOTE: These values are only needed when computing the next micropanel of A. */ \ - const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ - const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ + const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \ \ /* Compute number of primary and leftover components of the IR loop. 
*/ \ dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ @@ -458,7 +458,7 @@ void PASTECH2(bao_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. */ \ - bli_thread_barrier( rntm, thread_pb ); \ + bli_thrinfo_barrier( thread_pb ); \ } \ } \ \ diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/gemmd/bao_l3_packm_a.c index 1d6502884..b33fd9089 100644 --- a/addon/gemmd/bao_l3_packm_a.c +++ b/addon/gemmd/bao_l3_packm_a.c @@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -313,13 +313,13 @@ void PASTECH2(bao_,ch,opname) \ d, incd, \ a, rs_a, cs_a, \ *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ + pd_p, *ps_p, \ cntx, \ thread \ ); \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_a ) diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/gemmd/bao_l3_packm_b.c index 8d020007c..76860c8ee 100644 --- a/addon/gemmd/bao_l3_packm_b.c +++ b/addon/gemmd/bao_l3_packm_b.c @@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ @@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -313,13 +313,13 @@ void PASTECH2(bao_,ch,opname) \ d, incd, \ b, rs_b, cs_b, \ *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ + pd_p, *ps_p, \ cntx, \ thread \ ); \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_b ) diff --git a/addon/gemmd/bao_l3_packm_var1.c b/addon/gemmd/bao_l3_packm_var1.c index 24c0a2cc1..d002dc6bf 100644 --- a/addon/gemmd/bao_l3_packm_var1.c +++ b/addon/gemmd/bao_l3_packm_var1.c @@ -127,8 +127,8 @@ void PASTECH2(bao_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/addon/gemmd/bao_l3_packm_var2.c b/addon/gemmd/bao_l3_packm_var2.c index 830e499b3..49e9d1941 100644 --- a/addon/gemmd/bao_l3_packm_var2.c +++ b/addon/gemmd/bao_l3_packm_var2.c @@ -127,8 +127,8 @@ void PASTECH2(bao_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). 
*/ \ ( void )nt; \ diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 8d29d73b2..db20ffbca 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1,122 +1,69 @@ EXPORTS bli_abort bli_absqsc -bli_absqsc_check -bli_absqsc_qfp bli_acquire_mij bli_acquire_mpart bli_acquire_mpart_b2t bli_acquire_mpart_br2tl bli_acquire_mpart_l2r -bli_acquire_mpart_l2r_check bli_acquire_mpart_mdim bli_acquire_mpart_mndim bli_acquire_mpart_ndim bli_acquire_mpart_r2l bli_acquire_mpart_t2b -bli_acquire_mpart_t2b_check bli_acquire_mpart_tl2br -bli_acquire_mpart_tl2br_check bli_acquire_vi bli_acquire_vpart_b2f bli_acquire_vpart_f2b bli_addd -bli_addd_check bli_addd_ex -bli_addd_ex_qfp bli_addm -bli_addm_check bli_addm_ex -bli_addm_ex_qfp bli_addsc -bli_addsc_check -bli_addsc_qfp bli_addv -bli_addv_check bli_addv_ex -bli_addv_ex_qfp -bli_adjust_strides bli_align_dim_to_mult bli_align_dim_to_size bli_align_ptr_to_size bli_amaxv -bli_amaxv_check bli_amaxv_ex -bli_amaxv_ex_qfp -bli_apool_alloc_block -bli_apool_array_elem -bli_apool_checkin_array -bli_apool_checkout_array -bli_apool_finalize -bli_apool_free_block -bli_apool_grow -bli_apool_init bli_arch_query_id -bli_arch_set_id -bli_arch_set_id_once bli_arch_string -bli_array_elem -bli_array_finalize -bli_array_init -bli_array_resize -bli_array_set_elem bli_asumv -bli_asumv_check bli_asumv_ex -bli_asumv_ex_qfp bli_axpbyv -bli_axpbyv_check bli_axpbyv_ex -bli_axpbyv_ex_qfp bli_axpy2v -bli_axpy2v_check bli_axpy2v_ex -bli_axpy2v_ex_qfp bli_axpyd -bli_axpyd_check bli_axpyd_ex -bli_axpyd_ex_qfp bli_axpyf -bli_axpyf_check bli_axpyf_ex -bli_axpyf_ex_qfp bli_axpym -bli_axpym_check bli_axpym_ex -bli_axpym_ex_qfp bli_axpyv -bli_axpyv_check bli_axpyv_ex -bli_axpyv_ex_qfp bli_blksz_create bli_blksz_create_ed bli_blksz_free bli_blksz_init bli_blksz_init_easy bli_blksz_init_ed -bli_blksz_reduce_def_to -bli_blksz_reduce_max_to bli_cabsqsc bli_caddd bli_caddd_ex bli_caddm bli_caddm_ex -bli_caddm_unb_var1 bli_caddsc 
bli_caddv bli_caddv_ex -bli_calloc_intl bli_camaxv bli_camaxv_ex bli_castm -bli_castm_check bli_castnzm -bli_castnzm_check bli_castv -bli_castv_check bli_casumv bli_casumv_ex -bli_casumv_unb_var1 bli_caxpbyv bli_caxpbyv_ex bli_caxpy2v @@ -127,33 +74,24 @@ bli_caxpyf bli_caxpyf_ex bli_caxpym bli_caxpym_ex -bli_caxpym_unb_var1 bli_caxpyv bli_caxpyv_ex bli_cccastm bli_cccastnzm bli_cccastv bli_cccopysc -bli_ccgemm_ker_var2_md bli_ccopyd bli_ccopyd_ex bli_ccopym bli_ccopym_ex -bli_ccopym_unb_var1 bli_ccopyv bli_ccopyv_ex -bli_ccpackm_blk_var1_md -bli_ccpackm_cxk_1e_md -bli_ccpackm_cxk_1r_md -bli_ccpackm_struc_cxk_md bli_ccxpbym_md bli_ccxpbym_md_ex -bli_ccxpbym_md_unb_var1 bli_cdcastm bli_cdcastnzm bli_cdcastv bli_cdcopysc -bli_cdgemm_ker_var2_md bli_cdivsc bli_cdotaxpyv bli_cdotaxpyv_ex @@ -165,288 +103,111 @@ bli_cdotxf bli_cdotxf_ex bli_cdotxv bli_cdotxv_ex -bli_cdpackm_blk_var1_md -bli_cdpackm_cxk_1e_md -bli_cdpackm_cxk_1r_md -bli_cdpackm_struc_cxk_md bli_cdxpbym_md bli_cdxpbym_md_ex -bli_cdxpbym_md_unb_var1 +bli_ceqm +bli_ceqsc +bli_ceqv bli_cfprintm bli_cfprintv bli_cgemm -bli_cgemm1m -bli_cgemm3m1 -bli_cgemm3mh -bli_cgemm4m1 -bli_cgemm4mb -bli_cgemm4mb_ker_var2 -bli_cgemm4mh bli_cgemm_ex -bli_cgemm_ker_var2 -bli_cgemm_md_c2r_ref -bli_cgemmtrsm_l_ukernel -bli_cgemmtrsm_u_ukernel -bli_cgemm_ukernel +bli_cgemmt +bli_cgemmt_ex bli_cgemv bli_cgemv_ex -bli_cgemv_unb_var1 -bli_cgemv_unb_var2 -bli_cgemv_unf_var1 -bli_cgemv_unf_var2 bli_cger bli_cger_ex -bli_cger_unb_var1 -bli_cger_unb_var2 bli_cgetijm +bli_cgetijv bli_cgetsc -bli_check_alignment_is_mult_of_ptr_size -bli_check_alignment_is_power_of_two -bli_check_conformal_dims -bli_check_consistent_datatypes -bli_check_consistent_object_datatypes -bli_check_consistent_object_precisions -bli_check_consistent_precisions -bli_check_datatype_real_proj_of -bli_check_equal_vector_lengths bli_check_error_code_helper -bli_check_floating_datatype -bli_check_floating_object -bli_check_general_object -bli_check_hermitian_object 
-bli_check_if_exhausted_pool -bli_check_integer_datatype -bli_check_integer_object -bli_check_level3_dims -bli_check_matrix_object -bli_check_matrix_strides -bli_check_nonconstant_datatype -bli_check_nonconstant_object -bli_check_noninteger_datatype -bli_check_noninteger_object -bli_check_nonunit_diag -bli_check_null_pointer -bli_check_object_alias_of -bli_check_object_buffer -bli_check_object_diag_offset_equals -bli_check_object_length_equals -bli_check_object_real_proj_of -bli_check_object_struc -bli_check_object_valid_datatype -bli_check_object_width_equals -bli_check_packm_schema_on_unpack -bli_check_packv_schema_on_unpack -bli_check_real_datatype -bli_check_real_object -bli_check_real_valued_object -bli_check_scalar_object -bli_check_square_object -bli_check_sufficient_stack_buf_size -bli_check_symmetric_object -bli_check_triangular_object -bli_check_upper_or_lower_object -bli_check_valid_1x3_subpart -bli_check_valid_3x1_subpart -bli_check_valid_3x3_subpart -bli_check_valid_arch_id -bli_check_valid_cntl -bli_check_valid_datatype -bli_check_valid_diag -bli_check_valid_error_level -bli_check_valid_kc_mod_mult -bli_check_valid_malloc_buf -bli_check_valid_mc_mod_mult -bli_check_valid_nc_mod_mult -bli_check_valid_packbuf -bli_check_valid_side -bli_check_valid_trans -bli_check_valid_uplo -bli_check_vector_dim_equals -bli_check_vector_object bli_chemm -bli_chemm1m -bli_chemm3m1 -bli_chemm3mh -bli_chemm4m1 -bli_chemm4mh bli_chemm_ex bli_chemv bli_chemv_ex -bli_chemv_unb_var1 -bli_chemv_unb_var2 -bli_chemv_unb_var3 -bli_chemv_unb_var4 -bli_chemv_unf_var1 -bli_chemv_unf_var1a -bli_chemv_unf_var3 -bli_chemv_unf_var3a bli_cher bli_cher2 bli_cher2_ex bli_cher2k -bli_cher2k1m -bli_cher2k3m1 -bli_cher2k3mh -bli_cher2k4m1 -bli_cher2k4mh bli_cher2k_ex -bli_cher2_unb_var1 -bli_cher2_unb_var2 -bli_cher2_unb_var3 -bli_cher2_unb_var4 -bli_cher2_unf_var1 -bli_cher2_unf_var4 bli_cher_ex bli_cherk -bli_cherk1m -bli_cherk3m1 -bli_cherk3mh -bli_cherk4m1 -bli_cherk4mh bli_cherk_ex 
-bli_cherk_l_ker_var2 -bli_cherk_u_ker_var2 -bli_cher_unb_var1 -bli_cher_unb_var2 bli_cinvertd bli_cinvertd_ex bli_cinvertsc bli_cinvertv bli_cinvertv_ex +bli_cinvscald +bli_cinvscald_ex +bli_cinvscalm +bli_cinvscalm_ex +bli_cinvscalv +bli_cinvscalv_ex bli_clock -bli_clock_helper bli_clock_min_diff bli_cmachval bli_cmkherm bli_cmkherm_ex -bli_cmkherm_unb_var1 bli_cmksymm bli_cmksymm_ex -bli_cmksymm_unb_var1 bli_cmktrim bli_cmktrim_ex -bli_cmktrim_unb_var1 bli_cmulsc bli_cnorm1m bli_cnorm1m_ex -bli_cnorm1m_unb_var1 bli_cnorm1v bli_cnorm1v_ex -bli_cnorm1v_unb_var1 bli_cnormfm bli_cnormfm_ex -bli_cnormfm_unb_var1 bli_cnormfsc bli_cnormfv bli_cnormfv_ex -bli_cnormfv_unb_var1 bli_cnormim bli_cnormim_ex -bli_cnormim_unb_var1 bli_cnormiv bli_cnormiv_ex -bli_cnormiv_unb_var1 -bli_cntl_calc_num_threads_in bli_cntl_clear_node bli_cntl_copy bli_cntl_create_node bli_cntl_free bli_cntl_free_node -bli_cntl_free_wo_thrinfo -bli_cntl_free_w_thrinfo bli_cntl_mark_family -bli_cntx_1m_stage -bli_cntx_3m1_stage -bli_cntx_3mh_stage -bli_cntx_4m1_stage -bli_cntx_4mb_stage -bli_cntx_4mh_stage bli_cntx_clear -bli_cntx_ind_stage -bli_cntx_nat_stage bli_cntx_print bli_cntx_set_blkszs bli_cntx_set_ind_blkszs -bli_cntx_set_l1f_kers -bli_cntx_set_l1v_kers -bli_cntx_set_l3_nat_ukrs -bli_cntx_set_packm_kers +bli_cntx_set_l3_sup_handlers +bli_cntx_set_ukr_prefs +bli_cntx_set_ukrs bli_copyd -bli_copyd_check bli_copyd_ex -bli_copyd_ex_qfp bli_copym -bli_copym_check bli_copym_ex -bli_copym_ex_qfp bli_copysc -bli_copysc_check bli_copyv -bli_copyv_check bli_copyv_ex -bli_copyv_ex_qfp -bli_cpackm_blk_var1 -bli_cpackm_cxk -bli_cpackm_cxk_1er -bli_cpackm_cxk_3mis -bli_cpackm_cxk_4mi -bli_cpackm_cxk_rih -bli_cpackm_herm_cxk -bli_cpackm_herm_cxk_1er -bli_cpackm_herm_cxk_3mis -bli_cpackm_herm_cxk_4mi -bli_cpackm_herm_cxk_rih -bli_cpackm_struc_cxk -bli_cpackm_struc_cxk_1er -bli_cpackm_struc_cxk_3mis -bli_cpackm_struc_cxk_4mi -bli_cpackm_struc_cxk_rih -bli_cpackm_tri_cxk -bli_cpackm_tri_cxk_1er 
-bli_cpackm_tri_cxk_3mis -bli_cpackm_tri_cxk_4mi -bli_cpackm_tri_cxk_rih -bli_cpackm_unb_var1 bli_cprintm -bli_cprintm_ex bli_cprintv -bli_cprintv_ex -bli_cpuid_is_bulldozer -bli_cpuid_is_excavator -bli_cpuid_is_haswell -bli_cpuid_is_knl -bli_cpuid_is_penryn -bli_cpuid_is_piledriver -bli_cpuid_is_sandybridge -bli_cpuid_is_skx -bli_cpuid_is_steamroller -bli_cpuid_is_zen -bli_cpuid_query -bli_cpuid_query_id bli_crandm bli_crandm_ex -bli_crandm_unb_var1 bli_crandnm bli_crandnm_ex -bli_crandnm_unb_var1 bli_crandnv bli_crandnv_ex -bli_crandnv_unb_var1 bli_crandv bli_crandv_ex -bli_crandv_unb_var1 bli_cscal2d bli_cscal2d_ex bli_cscal2m bli_cscal2m_ex -bli_cscal2m_unb_var1 bli_cscal2v bli_cscal2v_ex bli_cscald bli_cscald_ex bli_cscalm bli_cscalm_ex -bli_cscalm_unb_var1 bli_cscalv bli_cscalv_ex bli_cscastm @@ -458,42 +219,29 @@ bli_csetd_ex bli_csetid bli_csetid_ex bli_csetijm +bli_csetijv bli_csetm bli_csetm_ex -bli_csetm_unb_var1 bli_csetsc bli_csetv bli_csetv_ex -bli_csgemm_ker_var2_md bli_cshiftd bli_cshiftd_ex -bli_cspackm_blk_var1_md -bli_cspackm_cxk_1e_md -bli_cspackm_cxk_1r_md -bli_cspackm_struc_cxk_md bli_csqrtsc bli_csubd bli_csubd_ex bli_csubm bli_csubm_ex -bli_csubm_unb_var1 bli_csubsc bli_csubv bli_csubv_ex bli_csumsqv bli_csumsqv_ex -bli_csumsqv_unb_var1 bli_cswapv bli_cswapv_ex bli_csxpbym_md bli_csxpbym_md_ex -bli_csxpbym_md_unb_var1 bli_csymm -bli_csymm1m -bli_csymm3m1 -bli_csymm3mh -bli_csymm4m1 -bli_csymm4mh bli_csymm_ex bli_csymv bli_csymv_ex @@ -501,89 +249,39 @@ bli_csyr bli_csyr2 bli_csyr2_ex bli_csyr2k -bli_csyr2k1m -bli_csyr2k3m1 -bli_csyr2k3mh -bli_csyr2k4m1 -bli_csyr2k4mh bli_csyr2k_ex bli_csyr_ex bli_csyrk -bli_csyrk1m -bli_csyrk3m1 -bli_csyrk3mh -bli_csyrk4m1 -bli_csyrk4mh bli_csyrk_ex bli_ctrmm -bli_ctrmm1m bli_ctrmm3 -bli_ctrmm31m -bli_ctrmm33m1 -bli_ctrmm33mh -bli_ctrmm34m1 -bli_ctrmm34mh bli_ctrmm3_ex -bli_ctrmm3m1 -bli_ctrmm4m1 bli_ctrmm_ex -bli_ctrmm_ll_ker_var2 -bli_ctrmm_lu_ker_var2 -bli_ctrmm_rl_ker_var2 -bli_ctrmm_ru_ker_var2 
bli_ctrmv bli_ctrmv_ex -bli_ctrmv_unb_var1 -bli_ctrmv_unb_var2 -bli_ctrmv_unf_var1 -bli_ctrmv_unf_var2 bli_ctrsm -bli_ctrsm1m -bli_ctrsm3m1 -bli_ctrsm4m1 bli_ctrsm_ex -bli_ctrsm_ll_ker_var2 -bli_ctrsm_l_ukernel -bli_ctrsm_lu_ker_var2 -bli_ctrsm_rl_ker_var2 -bli_ctrsm_ru_ker_var2 -bli_ctrsm_u_ukernel bli_ctrsv bli_ctrsv_ex -bli_ctrsv_unb_var1 -bli_ctrsv_unb_var2 -bli_ctrsv_unf_var1 -bli_ctrsv_unf_var2 -bli_cunpackm_blk_var1 -bli_cunpackm_cxk -bli_cunpackm_unb_var1 bli_cunzipsc bli_cxpbyd bli_cxpbyd_ex bli_cxpbym bli_cxpbym_ex -bli_cxpbym_unb_var1 bli_cxpbyv bli_cxpbyv_ex bli_czcastm bli_czcastnzm bli_czcastv bli_czcopysc -bli_czgemm_ker_var2_md bli_czipsc -bli_czpackm_blk_var1_md -bli_czpackm_cxk_1e_md -bli_czpackm_cxk_1r_md -bli_czpackm_struc_cxk_md bli_czxpbym_md bli_czxpbym_md_ex -bli_czxpbym_md_unb_var1 bli_dabsqsc bli_daddd bli_daddd_ex bli_daddm bli_daddm_ex -bli_daddm_unb_var1 bli_daddsc bli_daddv bli_daddv_ex @@ -591,7 +289,6 @@ bli_damaxv bli_damaxv_ex bli_dasumv bli_dasumv_ex -bli_dasumv_unb_var1 bli_daxpbyv bli_daxpbyv_ex bli_daxpy2v @@ -602,33 +299,24 @@ bli_daxpyf bli_daxpyf_ex bli_daxpym bli_daxpym_ex -bli_daxpym_unb_var1 bli_daxpyv bli_daxpyv_ex bli_dccastm bli_dccastnzm bli_dccastv bli_dccopysc -bli_dcgemm_ker_var2_md bli_dcopyd bli_dcopyd_ex bli_dcopym bli_dcopym_ex -bli_dcopym_unb_var1 bli_dcopyv bli_dcopyv_ex -bli_dcpackm_blk_var1_md -bli_dcpackm_cxk_1e_md -bli_dcpackm_cxk_1r_md -bli_dcpackm_struc_cxk_md bli_dcxpbym_md bli_dcxpbym_md_ex -bli_dcxpbym_md_unb_var1 bli_ddcastm bli_ddcastnzm bli_ddcastv bli_ddcopysc -bli_ddgemm_ker_var2_md bli_ddivsc bli_ddotaxpyv bli_ddotaxpyv_ex @@ -640,183 +328,99 @@ bli_ddotxf bli_ddotxf_ex bli_ddotxv bli_ddotxv_ex -bli_ddpackm_blk_var1_md -bli_ddpackm_cxk_1e_md -bli_ddpackm_cxk_1r_md -bli_ddpackm_struc_cxk_md bli_ddxpbym_md bli_ddxpbym_md_ex -bli_ddxpbym_md_unb_var1 -bli_determine_blocksize -bli_determine_blocksize_b -bli_determine_blocksize_b_sub -bli_determine_blocksize_f -bli_determine_blocksize_f_sub +bli_deqm 
+bli_deqsc +bli_deqv bli_dfprintm bli_dfprintv bli_dgemm -bli_dgemm1m -bli_dgemm3m1 -bli_dgemm3mh -bli_dgemm4m1 -bli_dgemm4mb -bli_dgemm4mb_ker_var2 -bli_dgemm4mh bli_dgemm_ex -bli_dgemm_ker_var2 -bli_dgemmtrsm_l_ukernel -bli_dgemmtrsm_u_ukernel -bli_dgemm_ukernel +bli_dgemmt +bli_dgemmt_ex bli_dgemv bli_dgemv_ex -bli_dgemv_unb_var1 -bli_dgemv_unb_var2 -bli_dgemv_unf_var1 -bli_dgemv_unf_var2 bli_dger bli_dger_ex -bli_dger_unb_var1 -bli_dger_unb_var2 bli_dgetijm +bli_dgetijv bli_dgetsc bli_dhemm -bli_dhemm1m -bli_dhemm3m1 -bli_dhemm3mh -bli_dhemm4m1 -bli_dhemm4mh bli_dhemm_ex bli_dhemv bli_dhemv_ex -bli_dhemv_unb_var1 -bli_dhemv_unb_var2 -bli_dhemv_unb_var3 -bli_dhemv_unb_var4 -bli_dhemv_unf_var1 -bli_dhemv_unf_var1a -bli_dhemv_unf_var3 -bli_dhemv_unf_var3a bli_dher bli_dher2 bli_dher2_ex bli_dher2k -bli_dher2k1m -bli_dher2k3m1 -bli_dher2k3mh -bli_dher2k4m1 -bli_dher2k4mh bli_dher2k_ex -bli_dher2_unb_var1 -bli_dher2_unb_var2 -bli_dher2_unb_var3 -bli_dher2_unb_var4 -bli_dher2_unf_var1 -bli_dher2_unf_var4 bli_dher_ex bli_dherk -bli_dherk1m -bli_dherk3m1 -bli_dherk3mh -bli_dherk4m1 -bli_dherk4mh bli_dherk_ex -bli_dherk_l_ker_var2 -bli_dherk_u_ker_var2 -bli_dher_unb_var1 -bli_dher_unb_var2 bli_dinvertd bli_dinvertd_ex bli_dinvertsc bli_dinvertv bli_dinvertv_ex +bli_dinvscald +bli_dinvscald_ex +bli_dinvscalm +bli_dinvscalm_ex +bli_dinvscalv +bli_dinvscalv_ex bli_divsc -bli_divsc_check -bli_divsc_qfp -bli_dlamch bli_dmachval bli_dmkherm bli_dmkherm_ex -bli_dmkherm_unb_var1 bli_dmksymm bli_dmksymm_ex -bli_dmksymm_unb_var1 bli_dmktrim bli_dmktrim_ex -bli_dmktrim_unb_var1 bli_dmulsc bli_dnorm1m bli_dnorm1m_ex -bli_dnorm1m_unb_var1 bli_dnorm1v bli_dnorm1v_ex -bli_dnorm1v_unb_var1 bli_dnormfm bli_dnormfm_ex -bli_dnormfm_unb_var1 bli_dnormfsc bli_dnormfv bli_dnormfv_ex -bli_dnormfv_unb_var1 bli_dnormim bli_dnormim_ex -bli_dnormim_unb_var1 bli_dnormiv bli_dnormiv_ex -bli_dnormiv_unb_var1 bli_dotaxpyv -bli_dotaxpyv_check bli_dotaxpyv_ex -bli_dotaxpyv_ex_qfp bli_dotv 
-bli_dotv_check bli_dotv_ex -bli_dotv_ex_qfp bli_dotxaxpyf -bli_dotxaxpyf_check bli_dotxaxpyf_ex -bli_dotxaxpyf_ex_qfp bli_dotxf -bli_dotxf_check bli_dotxf_ex -bli_dotxf_ex_qfp bli_dotxv -bli_dotxv_check bli_dotxv_ex -bli_dotxv_ex_qfp -bli_dpackm_blk_var1 -bli_dpackm_cxk -bli_dpackm_herm_cxk -bli_dpackm_struc_cxk -bli_dpackm_tri_cxk -bli_dpackm_unb_var1 bli_dprintm -bli_dprintm_ex bli_dprintv -bli_dprintv_ex bli_drandm bli_drandm_ex -bli_drandm_unb_var1 bli_drandnm bli_drandnm_ex -bli_drandnm_unb_var1 bli_drandnv bli_drandnv_ex -bli_drandnv_unb_var1 bli_drandv bli_drandv_ex -bli_drandv_unb_var1 bli_dscal2d bli_dscal2d_ex bli_dscal2m bli_dscal2m_ex -bli_dscal2m_unb_var1 bli_dscal2v bli_dscal2v_ex bli_dscald bli_dscald_ex bli_dscalm bli_dscalm_ex -bli_dscalm_unb_var1 bli_dscalv bli_dscalv_ex bli_dscastm @@ -828,42 +432,29 @@ bli_dsetd_ex bli_dsetid bli_dsetid_ex bli_dsetijm +bli_dsetijv bli_dsetm bli_dsetm_ex -bli_dsetm_unb_var1 bli_dsetsc bli_dsetv bli_dsetv_ex -bli_dsgemm_ker_var2_md bli_dshiftd bli_dshiftd_ex -bli_dspackm_blk_var1_md -bli_dspackm_cxk_1e_md -bli_dspackm_cxk_1r_md -bli_dspackm_struc_cxk_md bli_dsqrtsc bli_dsubd bli_dsubd_ex bli_dsubm bli_dsubm_ex -bli_dsubm_unb_var1 bli_dsubsc bli_dsubv bli_dsubv_ex bli_dsumsqv bli_dsumsqv_ex -bli_dsumsqv_unb_var1 bli_dswapv bli_dswapv_ex bli_dsxpbym_md bli_dsxpbym_md_ex -bli_dsxpbym_md_unb_var1 bli_dsymm -bli_dsymm1m -bli_dsymm3m1 -bli_dsymm3mh -bli_dsymm4m1 -bli_dsymm4mh bli_dsymm_ex bli_dsymv bli_dsymv_ex @@ -871,301 +462,79 @@ bli_dsyr bli_dsyr2 bli_dsyr2_ex bli_dsyr2k -bli_dsyr2k1m -bli_dsyr2k3m1 -bli_dsyr2k3mh -bli_dsyr2k4m1 -bli_dsyr2k4mh bli_dsyr2k_ex bli_dsyr_ex bli_dsyrk -bli_dsyrk1m -bli_dsyrk3m1 -bli_dsyrk3mh -bli_dsyrk4m1 -bli_dsyrk4mh bli_dsyrk_ex bli_dtrmm -bli_dtrmm1m bli_dtrmm3 -bli_dtrmm31m -bli_dtrmm33m1 -bli_dtrmm33mh -bli_dtrmm34m1 -bli_dtrmm34mh bli_dtrmm3_ex -bli_dtrmm3m1 -bli_dtrmm4m1 bli_dtrmm_ex -bli_dtrmm_ll_ker_var2 -bli_dtrmm_lu_ker_var2 -bli_dtrmm_rl_ker_var2 -bli_dtrmm_ru_ker_var2 
bli_dtrmv bli_dtrmv_ex -bli_dtrmv_unb_var1 -bli_dtrmv_unb_var2 -bli_dtrmv_unf_var1 -bli_dtrmv_unf_var2 bli_dtrsm -bli_dtrsm1m -bli_dtrsm3m1 -bli_dtrsm4m1 bli_dtrsm_ex -bli_dtrsm_ll_ker_var2 -bli_dtrsm_l_ukernel -bli_dtrsm_lu_ker_var2 -bli_dtrsm_rl_ker_var2 -bli_dtrsm_ru_ker_var2 -bli_dtrsm_u_ukernel bli_dtrsv bli_dtrsv_ex -bli_dtrsv_unb_var1 -bli_dtrsv_unb_var2 -bli_dtrsv_unf_var1 -bli_dtrsv_unf_var2 bli_dt_size -bli_dt_size_check bli_dt_string -bli_dt_string_check -bli_dt_union_check -bli_dunpackm_blk_var1 -bli_dunpackm_cxk -bli_dunpackm_unb_var1 bli_dunzipsc bli_dxpbyd bli_dxpbyd_ex bli_dxpbym bli_dxpbym_ex -bli_dxpbym_unb_var1 bli_dxpbyv bli_dxpbyv_ex bli_dzcastm bli_dzcastnzm bli_dzcastv bli_dzcopysc -bli_dzgemm_ker_var2_md bli_dzipsc -bli_dzpackm_blk_var1_md -bli_dzpackm_cxk_1e_md -bli_dzpackm_cxk_1r_md -bli_dzpackm_struc_cxk_md bli_dzxpbym_md bli_dzxpbym_md_ex -bli_dzxpbym_md_unb_var1 +bli_eqm +bli_eqsc +bli_eqv bli_error_checking_is_enabled bli_error_checking_level bli_error_checking_level_set -bli_error_string_for_code -bli_ffree_align -bli_ffree_noalign bli_finalize -bli_finalize_apis -bli_finalize_auto -bli_finalize_once -bli_find_area_trap_l -bli_fmalloc_align -bli_fmalloc_align_check -bli_fmalloc_noalign -bli_fmalloc_post_check bli_fprintm -bli_fprintm_check -bli_fprintm_ex -bli_fprintm_qfp bli_fprintv -bli_fprintv_check -bli_fprintv_ex -bli_fprintv_qfp -bli_free_intl bli_free_user -bli_func_create -bli_func_free -bli_func_init -bli_func_init_null -bli_func_is_null -bli_func_is_null_dt -bli_gcd bli_gemm -bli_gemm1m -bli_gemm3m1 -bli_gemm3mh -bli_gemm4m1 -bli_gemm4mb -bli_gemm4mb_ker_var2 -bli_gemm4mh -bli_gemm_basic_check -bli_gemm_blk_var1 -bli_gemm_blk_var2 -bli_gemm_blk_var3 -bli_gemmbp_cntl_create -bli_gemm_check -bli_gemm_cntl_create -bli_gemm_cntl_create_node -bli_gemm_cntl_free -bli_gemm_determine_kc -bli_gemm_determine_kc_b -bli_gemm_determine_kc_f -bli_gemm_direct bli_gemm_ex -bli_gemm_front -bli_gemmind -bli_gemmind_get_avail -bli_gemm_int 
-bli_gemm_ker_var2 -bli_gemm_ker_var2_md -bli_gemm_md -bli_gemm_md_ccc -bli_gemm_md_ccr -bli_gemm_md_crc -bli_gemm_md_crr -bli_gemm_md_rcc -bli_gemm_md_rcr -bli_gemm_md_rrc -bli_gemm_md_rrr -bli_gemmnat -bli_gemm_packa -bli_gemm_packb -bli_gemm_prune_unref_mparts_k -bli_gemm_prune_unref_mparts_m -bli_gemm_prune_unref_mparts_n -bli_gemmtrsm_l_ukernel_qfp +bli_gemmt +bli_gemmt_ex bli_gemmtrsm_ukernel -bli_gemmtrsm_u_ukernel_qfp bli_gemm_ukernel -bli_gemm_ukernel_qfp bli_gemv -bli_gemv_check bli_gemv_ex -bli_gemv_ex_qfp -bli_gemv_unb_var1 -bli_gemv_unb_var1_qfp -bli_gemv_unb_var2 -bli_gemv_unb_var2_qfp -bli_gemv_unf_var1 -bli_gemv_unf_var1_qfp -bli_gemv_unf_var2 -bli_gemv_unf_var2_qfp bli_ger -bli_ger_check bli_ger_ex -bli_ger_ex_qfp -bli_ger_unb_var1 -bli_ger_unb_var1_qfp -bli_ger_unb_var2 -bli_ger_unb_var2_qfp bli_getijm +bli_getijv bli_getopt bli_getopt_init_state bli_getsc -bli_getsc_check -bli_getsc_qfp -bli_gks_cntx_l3_nat_ukr_is_ref -bli_gks_finalize -bli_gks_init -bli_gks_init_index bli_gks_init_ref_cntx bli_gks_l3_ukr_impl_string bli_gks_l3_ukr_impl_type -bli_gks_lookup_ind_cntx -bli_gks_lookup_nat_cntx bli_gks_query_cntx -bli_gks_query_cntx_noinit bli_gks_query_ind_cntx bli_gks_query_nat_cntx -bli_gks_register_cntx bli_hemm -bli_hemm1m -bli_hemm3m1 -bli_hemm3mh -bli_hemm4m1 -bli_hemm4mh -bli_hemm_basic_check -bli_hemm_check bli_hemm_ex -bli_hemm_front -bli_hemmind -bli_hemmind_get_avail -bli_hemmnat bli_hemv -bli_hemv_check bli_hemv_ex -bli_hemv_ex_qfp -bli_hemv_unb_var1 -bli_hemv_unb_var1_qfp -bli_hemv_unb_var2 -bli_hemv_unb_var2_qfp -bli_hemv_unb_var3 -bli_hemv_unb_var3_qfp -bli_hemv_unb_var4 -bli_hemv_unb_var4_qfp -bli_hemv_unf_var1 -bli_hemv_unf_var1a -bli_hemv_unf_var1a_qfp -bli_hemv_unf_var1_qfp -bli_hemv_unf_var3 -bli_hemv_unf_var3a -bli_hemv_unf_var3a_qfp -bli_hemv_unf_var3_qfp bli_her bli_her2 -bli_her2_check bli_her2_ex -bli_her2_ex_qfp bli_her2k -bli_her2k1m -bli_her2k3m1 -bli_her2k3mh -bli_her2k4m1 -bli_her2k4mh -bli_her2k_basic_check 
-bli_her2k_check bli_her2k_ex -bli_her2k_front -bli_her2kind -bli_her2kind_get_avail -bli_her2knat -bli_her2_unb_var1 -bli_her2_unb_var1_qfp -bli_her2_unb_var2 -bli_her2_unb_var2_qfp -bli_her2_unb_var3 -bli_her2_unb_var3_qfp -bli_her2_unb_var4 -bli_her2_unb_var4_qfp -bli_her2_unf_var1 -bli_her2_unf_var1_qfp -bli_her2_unf_var4 -bli_her2_unf_var4_qfp -bli_her_check bli_her_ex -bli_her_ex_qfp bli_herk -bli_herk1m -bli_herk3m1 -bli_herk3mh -bli_herk4m1 -bli_herk4mh -bli_herk_basic_check -bli_herk_check -bli_herk_determine_kc -bli_herk_determine_kc_b -bli_herk_determine_kc_f -bli_herk_direct bli_herk_ex -bli_herk_front -bli_herkind -bli_herkind_get_avail -bli_herk_l_ker_var2 -bli_herknat -bli_herk_prune_unref_mparts_k -bli_herk_prune_unref_mparts_m -bli_herk_prune_unref_mparts_n -bli_herk_u_ker_var2 -bli_herk_x_ker_var2 -bli_her_unb_var1 -bli_her_unb_var1_qfp -bli_her_unb_var2 -bli_her_unb_var2_qfp bli_ifprintm bli_ifprintv bli_igetsc @@ -1175,13 +544,8 @@ bli_ind_disable_all_dt bli_ind_disable_dt bli_ind_enable bli_ind_enable_dt -bli_ind_finalize -bli_ind_get_impl_string -bli_ind_init -bli_ind_map_cdt_to_index bli_ind_oper_enable_only bli_ind_oper_find_avail -bli_ind_oper_get_avail bli_ind_oper_get_avail_impl_string bli_ind_oper_is_impl bli_info_get_blas_int_type_size @@ -1189,13 +553,15 @@ bli_info_get_enable_blas bli_info_get_enable_cblas bli_info_get_enable_memkind bli_info_get_enable_openmp +bli_info_get_enable_openmp_as_default bli_info_get_enable_pba_pools bli_info_get_enable_pthreads +bli_info_get_enable_pthreads_as_default bli_info_get_enable_sandbox bli_info_get_enable_sba_pools -bli_info_get_enable_stay_auto_init bli_info_get_enable_threading bli_info_get_gemm_impl_string +bli_info_get_gemmt_impl_string bli_info_get_gemmtrsm_l_ukr_impl_string bli_info_get_gemmtrsm_u_ukr_impl_string bli_info_get_gemm_ukr_impl_string @@ -1209,7 +575,14 @@ bli_info_get_int_type_size_str bli_info_get_max_type_size bli_info_get_num_fp_types bli_info_get_page_size 
-bli_info_get_pool_addr_align_size +bli_info_get_pool_addr_align_size_a +bli_info_get_pool_addr_align_size_b +bli_info_get_pool_addr_align_size_c +bli_info_get_pool_addr_align_size_gen +bli_info_get_pool_addr_offset_size_a +bli_info_get_pool_addr_offset_size_b +bli_info_get_pool_addr_offset_size_c +bli_info_get_pool_addr_offset_size_gen bli_info_get_simd_align_size bli_info_get_simd_num_registers bli_info_get_simd_size @@ -1227,152 +600,57 @@ bli_info_get_trsm_l_ukr_impl_string bli_info_get_trsm_u_ukr_impl_string bli_info_get_version_str bli_init -bli_init_apis -bli_init_auto -bli_init_once bli_invertd -bli_invertd_check bli_invertd_ex -bli_invertd_ex_qfp bli_invertsc -bli_invertsc_check -bli_invertsc_qfp bli_invertv -bli_invertv_check bli_invertv_ex -bli_invertv_ex_qfp -bli_ipow +bli_invscald +bli_invscald_ex +bli_invscalm +bli_invscalm_ex +bli_invscalv +bli_invscalv_ex bli_iprintm -bli_iprintm_ex bli_iprintv -bli_iprintv_ex bli_isetsc -bli_l0_xsc_check -bli_l0_xx2sc_check -bli_l0_xxsc_check -bli_l1d_ax_check -bli_l1d_axy_check -bli_l1d_x_check -bli_l1d_xy_check -bli_l1m_ax_check -bli_l1m_axy_check -bli_l1m_xy_check -bli_l1v_axby_check -bli_l1v_ax_check -bli_l1v_axy_check -bli_l1v_dot_check -bli_l1v_xby_check -bli_l1v_x_check -bli_l1v_xi_check -bli_l1v_xy_check -bli_l3_basic_check -bli_l3_cntl_create_if bli_l3_cntl_free -bli_l3_determine_kc -bli_l3_direct -bli_l3_ind_oper_enable_only -bli_l3_ind_oper_find_avail -bli_l3_ind_oper_get_enable -bli_l3_ind_oper_get_func -bli_l3_ind_oper_set_enable -bli_l3_ind_oper_set_enable_all -bli_l3_ind_set_enable_dt -bli_l3_packm -bli_l3_prune_unref_mparts_k -bli_l3_prune_unref_mparts_m -bli_l3_prune_unref_mparts_n -bli_l3_thread_decorator -bli_l3_thread_entry -bli_l3_thrinfo_create_root -bli_l3_thrinfo_free -bli_l3_thrinfo_free_paths -bli_l3_thrinfo_init_single -bli_l3_thrinfo_print_gemm_paths -bli_l3_thrinfo_print_trsm_paths -bli_lcm -bli_lsame +bli_l3_thrinfo_create bli_machval -bli_malloc_intl bli_malloc_user -bli_mbool_create 
-bli_mbool_free -bli_mbool_init -bli_pba_acquire_m -bli_pba_compute_pool_block_sizes -bli_pba_compute_pool_block_sizes_dt -bli_pba_finalize -bli_pba_finalize_pools -bli_pba_init -bli_pba_init_pools -bli_pba_pool_size -bli_pba_query -bli_pba_release -bli_memsys_finalize -bli_memsys_init bli_mkherm -bli_mkherm_check bli_mkherm_ex -bli_mkherm_ex_qfp bli_mksymm -bli_mksymm_check bli_mksymm_ex -bli_mksymm_ex_qfp bli_mktrim -bli_mktrim_check bli_mktrim_ex -bli_mktrim_ex_qfp bli_mulsc -bli_mulsc_check -bli_mulsc_qfp -bli_next_prime_factor bli_norm1m -bli_norm1m_check bli_norm1m_ex -bli_norm1m_ex_qfp bli_norm1v -bli_norm1v_check bli_norm1v_ex -bli_norm1v_ex_qfp bli_normfm -bli_normfm_check bli_normfm_ex -bli_normfm_ex_qfp bli_normfsc -bli_normfsc_check -bli_normfsc_qfp bli_normfv -bli_normfv_check bli_normfv_ex -bli_normfv_ex_qfp bli_normim -bli_normim_check bli_normim_ex -bli_normim_ex_qfp bli_normiv -bli_normiv_check bli_normiv_ex -bli_normiv_ex_qfp bli_obj_alloc_buffer -bli_obj_alloc_buffer_check bli_obj_attach_buffer -bli_obj_attach_buffer_check bli_obj_create bli_obj_create_1x1 bli_obj_create_1x1_with_attached_buffer -bli_obj_create_check bli_obj_create_conf_to -bli_obj_create_const_check -bli_obj_create_scalar_check bli_obj_create_with_attached_buffer bli_obj_create_without_buffer -bli_obj_create_without_buffer_check bli_obj_equals bli_obj_free -bli_obj_free_check bli_obj_imag_equals bli_obj_imag_is_zero bli_obj_print -bli_obj_print_check bli_obj_scalar_apply_scalar bli_obj_scalar_attach bli_obj_scalar_cast_to @@ -1382,21 +660,16 @@ bli_obj_scalar_has_nonzero_imag bli_obj_scalar_init_detached bli_obj_scalar_init_detached_copy_of bli_obj_scalar_reset -bli_packm_acquire_mpart_l2r -bli_packm_acquire_mpart_t2b -bli_packm_acquire_mpart_tl2br +bli_pack_get_pack_a +bli_pack_get_pack_b +bli_packm_alloc +bli_packm_alloc_ex bli_packm_blk_var1 -bli_packm_blk_var1_md bli_packm_cntl_create_node bli_packm_init -bli_packm_init_check -bli_packm_init_pack -bli_packm_int 
-bli_packm_int_check -bli_packm_offset_to_panel_for -bli_packm_thrinfo_init -bli_packm_thrinfo_init_single -bli_packm_unb_var1 +bli_packm_scalar +bli_pack_set_pack_a +bli_pack_set_pack_b bli_param_map_blis_to_char_conj bli_param_map_blis_to_char_diag bli_param_map_blis_to_char_dt @@ -1414,33 +687,11 @@ bli_param_map_char_to_blis_dt bli_param_map_char_to_blis_side bli_param_map_char_to_blis_trans bli_param_map_char_to_blis_uplo -bli_param_map_netlib_to_blis_diag -bli_param_map_netlib_to_blis_side -bli_param_map_netlib_to_blis_trans -bli_param_map_netlib_to_blis_uplo -bli_partition_2x2 -bli_pblk_print -bli_pool_alloc_block -bli_pool_checkin_block -bli_pool_checkout_block -bli_pool_finalize -bli_pool_free_block -bli_pool_grow -bli_pool_init -bli_pool_print -bli_pool_reinit -bli_pool_shrink -bli_prime_factorization +bli_pba_query bli_printm -bli_printm_ex -bli_print_msg bli_printv -bli_printv_ex bli_projm -bli_projm_check bli_projv -bli_projv_check -bli_prune_unref_mparts bli_pthread_barrier_destroy bli_pthread_barrier_init bli_pthread_barrier_wait @@ -1457,30 +708,22 @@ bli_pthread_mutex_trylock bli_pthread_mutex_unlock bli_pthread_once bli_randm -bli_randm_check bli_randm_ex -bli_randm_ex_qfp bli_randnm -bli_randnm_check bli_randnm_ex -bli_randnm_ex_qfp bli_randnv -bli_randnv_check bli_randnv_ex -bli_randnv_ex_qfp bli_randv -bli_randv_check bli_randv_ex -bli_randv_ex_qfp -bli_rntm_print +bli_rntm_init_from_global +bli_rntm_set_num_threads +bli_rntm_set_ways bli_rntm_set_ways_for_op -bli_rntm_set_ways_from_rntm bli_sabsqsc bli_saddd bli_saddd_ex bli_saddm bli_saddm_ex -bli_saddm_unb_var1 bli_saddsc bli_saddv bli_saddv_ex @@ -1488,7 +731,6 @@ bli_samaxv bli_samaxv_ex bli_sasumv bli_sasumv_ex -bli_sasumv_unb_var1 bli_saxpbyv bli_saxpbyv_ex bli_saxpy2v @@ -1499,65 +741,36 @@ bli_saxpyf bli_saxpyf_ex bli_saxpym bli_saxpym_ex -bli_saxpym_unb_var1 bli_saxpyv bli_saxpyv_ex -bli_sba_acquire -bli_sba_checkin_array -bli_sba_checkout_array -bli_sba_finalize -bli_sba_init 
-bli_sba_query -bli_sba_release -bli_sba_rntm_set_pool bli_scal2d -bli_scal2d_check bli_scal2d_ex -bli_scal2d_ex_qfp bli_scal2m -bli_scal2m_check bli_scal2m_ex -bli_scal2m_ex_qfp bli_scal2v -bli_scal2v_check bli_scal2v_ex -bli_scal2v_ex_qfp bli_scald -bli_scald_check bli_scald_ex -bli_scald_ex_qfp bli_scalm -bli_scalm_check bli_scalm_ex -bli_scalm_ex_qfp bli_scalv -bli_scalv_check bli_scalv_ex -bli_scalv_ex_qfp bli_sccastm bli_sccastnzm bli_sccastv bli_sccopysc -bli_scgemm_ker_var2_md bli_scopyd bli_scopyd_ex bli_scopym bli_scopym_ex -bli_scopym_unb_var1 bli_scopyv bli_scopyv_ex -bli_scpackm_blk_var1_md -bli_scpackm_cxk_1e_md -bli_scpackm_cxk_1r_md -bli_scpackm_struc_cxk_md bli_scxpbym_md bli_scxpbym_md_ex -bli_scxpbym_md_unb_var1 bli_sdcastm bli_sdcastnzm bli_sdcastv bli_sdcopysc -bli_sdgemm_ker_var2_md bli_sdivsc bli_sdotaxpyv bli_sdotaxpyv_ex @@ -1569,187 +782,107 @@ bli_sdotxf bli_sdotxf_ex bli_sdotxv bli_sdotxv_ex -bli_sdpackm_blk_var1_md -bli_sdpackm_cxk_1e_md -bli_sdpackm_cxk_1r_md -bli_sdpackm_struc_cxk_md bli_sdxpbym_md bli_sdxpbym_md_ex -bli_sdxpbym_md_unb_var1 +bli_seqm +bli_seqsc +bli_seqv bli_setd -bli_setd_check bli_setd_ex -bli_setd_ex_qfp bli_setid -bli_setid_check bli_setid_ex -bli_setid_ex_qfp bli_setijm +bli_setijv bli_setim bli_setiv bli_setm -bli_setm_check bli_setm_ex -bli_setm_ex_qfp bli_setrm bli_setrv bli_setsc -bli_setsc_check -bli_setsc_qfp bli_setv -bli_setv_check bli_setv_ex -bli_setv_ex_qfp bli_sfprintm bli_sfprintv bli_sgemm -bli_sgemm1m -bli_sgemm3m1 -bli_sgemm3mh -bli_sgemm4m1 -bli_sgemm4mb -bli_sgemm4mb_ker_var2 -bli_sgemm4mh bli_sgemm_ex -bli_sgemm_ker_var2 -bli_sgemmtrsm_l_ukernel -bli_sgemmtrsm_u_ukernel -bli_sgemm_ukernel +bli_sgemmt +bli_sgemmt_ex bli_sgemv bli_sgemv_ex -bli_sgemv_unb_var1 -bli_sgemv_unb_var2 -bli_sgemv_unf_var1 -bli_sgemv_unf_var2 bli_sger bli_sger_ex -bli_sger_unb_var1 -bli_sger_unb_var2 bli_sgetijm +bli_sgetijv bli_sgetsc bli_shemm -bli_shemm1m -bli_shemm3m1 -bli_shemm3mh -bli_shemm4m1 -bli_shemm4mh 
bli_shemm_ex bli_shemv bli_shemv_ex -bli_shemv_unb_var1 -bli_shemv_unb_var2 -bli_shemv_unb_var3 -bli_shemv_unb_var4 -bli_shemv_unf_var1 -bli_shemv_unf_var1a -bli_shemv_unf_var3 -bli_shemv_unf_var3a bli_sher bli_sher2 bli_sher2_ex bli_sher2k -bli_sher2k1m -bli_sher2k3m1 -bli_sher2k3mh -bli_sher2k4m1 -bli_sher2k4mh bli_sher2k_ex -bli_sher2_unb_var1 -bli_sher2_unb_var2 -bli_sher2_unb_var3 -bli_sher2_unb_var4 -bli_sher2_unf_var1 -bli_sher2_unf_var4 bli_sher_ex bli_sherk -bli_sherk1m -bli_sherk3m1 -bli_sherk3mh -bli_sherk4m1 -bli_sherk4mh bli_sherk_ex -bli_sherk_l_ker_var2 -bli_sherk_u_ker_var2 -bli_sher_unb_var1 -bli_sher_unb_var2 bli_shiftd -bli_shiftd_check bli_shiftd_ex -bli_shiftd_ex_qfp bli_sinvertd bli_sinvertd_ex bli_sinvertsc bli_sinvertv bli_sinvertv_ex -bli_slamch +bli_sinvscald +bli_sinvscald_ex +bli_sinvscalm +bli_sinvscalm_ex +bli_sinvscalv +bli_sinvscalv_ex bli_sleep bli_smachval bli_smkherm bli_smkherm_ex -bli_smkherm_unb_var1 bli_smksymm bli_smksymm_ex -bli_smksymm_unb_var1 bli_smktrim bli_smktrim_ex -bli_smktrim_unb_var1 bli_smulsc bli_snorm1m bli_snorm1m_ex -bli_snorm1m_unb_var1 bli_snorm1v bli_snorm1v_ex -bli_snorm1v_unb_var1 bli_snormfm bli_snormfm_ex -bli_snormfm_unb_var1 bli_snormfsc bli_snormfv bli_snormfv_ex -bli_snormfv_unb_var1 bli_snormim bli_snormim_ex -bli_snormim_unb_var1 bli_snormiv bli_snormiv_ex -bli_snormiv_unb_var1 -bli_spackm_blk_var1 -bli_spackm_cxk -bli_spackm_herm_cxk -bli_spackm_struc_cxk -bli_spackm_tri_cxk -bli_spackm_unb_var1 bli_sprintm -bli_sprintm_ex bli_sprintv -bli_sprintv_ex bli_sqrtsc -bli_sqrtsc_check -bli_sqrtsc_qfp bli_srandm bli_srandm_ex -bli_srandm_unb_var1 bli_srandnm bli_srandnm_ex -bli_srandnm_unb_var1 bli_srandnv bli_srandnv_ex -bli_srandnv_unb_var1 bli_srandv bli_srandv_ex -bli_srandv_unb_var1 bli_sscal2d bli_sscal2d_ex bli_sscal2m bli_sscal2m_ex -bli_sscal2m_unb_var1 bli_sscal2v bli_sscal2v_ex bli_sscald bli_sscald_ex bli_sscalm bli_sscalm_ex -bli_sscalm_unb_var1 bli_sscalv bli_sscalv_ex bli_sscastm @@ 
-1761,42 +894,29 @@ bli_ssetd_ex bli_ssetid bli_ssetid_ex bli_ssetijm +bli_ssetijv bli_ssetm bli_ssetm_ex -bli_ssetm_unb_var1 bli_ssetsc bli_ssetv bli_ssetv_ex -bli_ssgemm_ker_var2_md bli_sshiftd bli_sshiftd_ex -bli_sspackm_blk_var1_md -bli_sspackm_cxk_1e_md -bli_sspackm_cxk_1r_md -bli_sspackm_struc_cxk_md bli_ssqrtsc bli_ssubd bli_ssubd_ex bli_ssubm bli_ssubm_ex -bli_ssubm_unb_var1 bli_ssubsc bli_ssubv bli_ssubv_ex bli_ssumsqv bli_ssumsqv_ex -bli_ssumsqv_unb_var1 bli_sswapv bli_sswapv_ex bli_ssxpbym_md bli_ssxpbym_md_ex -bli_ssxpbym_md_unb_var1 bli_ssymm -bli_ssymm1m -bli_ssymm3m1 -bli_ssymm3mh -bli_ssymm4m1 -bli_ssymm4mh bli_ssymm_ex bli_ssymv bli_ssymv_ex @@ -1804,330 +924,99 @@ bli_ssyr bli_ssyr2 bli_ssyr2_ex bli_ssyr2k -bli_ssyr2k1m -bli_ssyr2k3m1 -bli_ssyr2k3mh -bli_ssyr2k4m1 -bli_ssyr2k4mh bli_ssyr2k_ex bli_ssyr_ex bli_ssyrk -bli_ssyrk1m -bli_ssyrk3m1 -bli_ssyrk3mh -bli_ssyrk4m1 -bli_ssyrk4mh bli_ssyrk_ex -bli_string_mkupper bli_strmm -bli_strmm1m bli_strmm3 -bli_strmm31m -bli_strmm33m1 -bli_strmm33mh -bli_strmm34m1 -bli_strmm34mh bli_strmm3_ex -bli_strmm3m1 -bli_strmm4m1 bli_strmm_ex -bli_strmm_ll_ker_var2 -bli_strmm_lu_ker_var2 -bli_strmm_rl_ker_var2 -bli_strmm_ru_ker_var2 bli_strmv bli_strmv_ex -bli_strmv_unb_var1 -bli_strmv_unb_var2 -bli_strmv_unf_var1 -bli_strmv_unf_var2 bli_strsm -bli_strsm1m -bli_strsm3m1 -bli_strsm4m1 bli_strsm_ex -bli_strsm_ll_ker_var2 -bli_strsm_l_ukernel -bli_strsm_lu_ker_var2 -bli_strsm_rl_ker_var2 -bli_strsm_ru_ker_var2 -bli_strsm_u_ukernel bli_strsv bli_strsv_ex -bli_strsv_unb_var1 -bli_strsv_unb_var2 -bli_strsv_unf_var1 -bli_strsv_unf_var2 bli_subd -bli_subd_check bli_subd_ex -bli_subd_ex_qfp bli_subm -bli_subm_check bli_subm_ex -bli_subm_ex_qfp bli_subsc -bli_subsc_check -bli_subsc_qfp bli_subv -bli_subv_check bli_subv_ex -bli_subv_ex_qfp bli_sumsqv -bli_sumsqv_check bli_sumsqv_ex -bli_sumsqv_ex_qfp -bli_sunpackm_blk_var1 -bli_sunpackm_cxk -bli_sunpackm_unb_var1 bli_sunzipsc bli_swapv -bli_swapv_check bli_swapv_ex 
-bli_swapv_ex_qfp bli_sxpbyd bli_sxpbyd_ex bli_sxpbym bli_sxpbym_ex -bli_sxpbym_unb_var1 bli_sxpbyv bli_sxpbyv_ex bli_symm -bli_symm1m -bli_symm3m1 -bli_symm3mh -bli_symm4m1 -bli_symm4mh -bli_symm_check bli_symm_ex -bli_symm_front -bli_symmind -bli_symmind_get_avail -bli_symmnat bli_symv -bli_symv_check bli_symv_ex -bli_symv_ex_qfp bli_syr bli_syr2 -bli_syr2_check bli_syr2_ex -bli_syr2_ex_qfp bli_syr2k -bli_syr2k1m -bli_syr2k3m1 -bli_syr2k3mh -bli_syr2k4m1 -bli_syr2k4mh -bli_syr2k_check bli_syr2k_ex -bli_syr2k_front -bli_syr2kind -bli_syr2kind_get_avail -bli_syr2knat -bli_syr_check bli_syr_ex -bli_syr_ex_qfp bli_syrk -bli_syrk1m -bli_syrk3m1 -bli_syrk3mh -bli_syrk4m1 -bli_syrk4mh -bli_syrk_check bli_syrk_ex -bli_syrk_front -bli_syrkind -bli_syrkind_get_avail -bli_syrknat bli_szcastm bli_szcastnzm bli_szcastv bli_szcopysc -bli_szgemm_ker_var2_md bli_szipsc -bli_szpackm_blk_var1_md -bli_szpackm_cxk_1e_md -bli_szpackm_cxk_1r_md -bli_szpackm_struc_cxk_md bli_szxpbym_md bli_szxpbym_md_ex -bli_szxpbym_md_unb_var1 bli_thrcomm_barrier -bli_thrcomm_barrier_atomic bli_thrcomm_bcast -bli_thrcomm_cleanup -bli_thrcomm_create -bli_thrcomm_free -bli_thrcomm_init -bli_thread_finalize -bli_thread_get_env bli_thread_get_ic_nt bli_thread_get_ir_nt bli_thread_get_jc_nt bli_thread_get_jr_nt bli_thread_get_num_threads bli_thread_get_pc_nt -bli_thread_init -bli_thread_init_rntm -bli_thread_init_rntm_from_env -bli_thread_range_b2t -bli_thread_range_l2r -bli_thread_range_mdim -bli_thread_range_ndim -bli_thread_range_r2l +bli_thread_get_thread_impl +bli_thread_get_thread_impl_str bli_thread_range_sub -bli_thread_range_t2b -bli_thread_range_weighted_b2t -bli_thread_range_weighted_l2r -bli_thread_range_weighted_r2l -bli_thread_range_weighted_sub -bli_thread_range_weighted_t2b -bli_thread_range_width_l bli_thread_set_num_threads bli_thread_set_num_threads_ +bli_thread_set_thread_impl bli_thread_set_ways bli_thread_set_ways_ -bli_thrinfo_create -bli_thrinfo_create_for_cntl 
-bli_thrinfo_create_for_cntl_prenode bli_thrinfo_free -bli_thrinfo_grow -bli_thrinfo_init -bli_thrinfo_init_single -bli_thrinfo_rgrow -bli_thrinfo_rgrow_prenode bli_trmm -bli_trmm1m bli_trmm3 -bli_trmm31m -bli_trmm33m1 -bli_trmm33mh -bli_trmm34m1 -bli_trmm34mh bli_trmm3_ex -bli_trmm3_front -bli_trmm3ind -bli_trmm3ind_get_avail -bli_trmm3m1 -bli_trmm3nat -bli_trmm4m1 -bli_trmm_check -bli_trmm_determine_kc -bli_trmm_determine_kc_b -bli_trmm_determine_kc_f -bli_trmm_direct bli_trmm_ex -bli_trmm_front -bli_trmmind -bli_trmmind_get_avail -bli_trmm_ll_ker_var2 -bli_trmm_lu_ker_var2 -bli_trmmnat -bli_trmm_prune_unref_mparts_k -bli_trmm_prune_unref_mparts_m -bli_trmm_prune_unref_mparts_n -bli_trmm_rl_ker_var2 -bli_trmm_ru_ker_var2 -bli_trmm_xx_ker_var2 bli_trmv -bli_trmv_check bli_trmv_ex -bli_trmv_ex_qfp -bli_trmv_unb_var1 -bli_trmv_unb_var1_qfp -bli_trmv_unb_var2 -bli_trmv_unb_var2_qfp -bli_trmv_unf_var1 -bli_trmv_unf_var1_qfp -bli_trmv_unf_var2 -bli_trmv_unf_var2_qfp bli_trsm -bli_trsm1m -bli_trsm3m1 -bli_trsm4m1 -bli_trsm_blk_var1 -bli_trsm_blk_var2 -bli_trsm_blk_var3 -bli_trsm_check -bli_trsm_cntl_create -bli_trsm_cntl_create_node -bli_trsm_cntl_free -bli_trsm_determine_kc -bli_trsm_determine_kc_b -bli_trsm_determine_kc_f -bli_trsm_direct bli_trsm_ex -bli_trsm_front -bli_trsmind -bli_trsmind_get_avail -bli_trsm_int -bli_trsm_l_cntl_create -bli_trsm_ll_ker_var2 -bli_trsm_l_ukernel_qfp -bli_trsm_lu_ker_var2 -bli_trsmnat -bli_trsm_packa -bli_trsm_packb -bli_trsm_prune_unref_mparts_k -bli_trsm_prune_unref_mparts_m -bli_trsm_prune_unref_mparts_n -bli_trsm_r_cntl_create -bli_trsm_rl_ker_var2 -bli_trsm_ru_ker_var2 bli_trsm_ukernel -bli_trsm_u_ukernel_qfp -bli_trsm_xx_ker_var2 bli_trsv -bli_trsv_check bli_trsv_ex -bli_trsv_ex_qfp -bli_trsv_unb_var1 -bli_trsv_unb_var1_qfp -bli_trsv_unb_var2 -bli_trsv_unb_var2_qfp -bli_trsv_unf_var1 -bli_trsv_unf_var1_qfp -bli_trsv_unf_var2 -bli_trsv_unf_var2_qfp -bli_unpackm_blk_var1 -bli_unpackm_cntl_create_node -bli_unpackm_int 
-bli_unpackm_int_check -bli_unpackm_unb_var1 bli_unzipsc -bli_unzipsc_check -bli_unzipsc_qfp -bli_utilm_fprint_check -bli_utilm_mkhst_check -bli_utilm_norm_check -bli_utilm_rand_check -bli_utilv_norm_check -bli_utilv_sumsqv_check -bli_utilv_xa_check bli_xpbyd -bli_xpbyd_check bli_xpbyd_ex -bli_xpbyd_ex_qfp bli_xpbym -bli_xpbym_check bli_xpbym_ex -bli_xpbym_ex_qfp bli_xpbym_md bli_xpbym_md_ex -bli_xpbym_md_ex_qfp2 bli_xpbyv -bli_xpbyv_check bli_xpbyv_ex -bli_xpbyv_ex_qfp -bli_xxmv_check -bli_xxr_check bli_zabsqsc bli_zaddd bli_zaddd_ex bli_zaddm bli_zaddm_ex -bli_zaddm_unb_var1 bli_zaddsc bli_zaddv bli_zaddv_ex @@ -2135,7 +1024,6 @@ bli_zamaxv bli_zamaxv_ex bli_zasumv bli_zasumv_ex -bli_zasumv_unb_var1 bli_zaxpbyv bli_zaxpbyv_ex bli_zaxpy2v @@ -2146,33 +1034,24 @@ bli_zaxpyf bli_zaxpyf_ex bli_zaxpym bli_zaxpym_ex -bli_zaxpym_unb_var1 bli_zaxpyv bli_zaxpyv_ex bli_zccastm bli_zccastnzm bli_zccastv bli_zccopysc -bli_zcgemm_ker_var2_md bli_zcopyd bli_zcopyd_ex bli_zcopym bli_zcopym_ex -bli_zcopym_unb_var1 bli_zcopyv bli_zcopyv_ex -bli_zcpackm_blk_var1_md -bli_zcpackm_cxk_1e_md -bli_zcpackm_cxk_1r_md -bli_zcpackm_struc_cxk_md bli_zcxpbym_md bli_zcxpbym_md_ex -bli_zcxpbym_md_unb_var1 bli_zdcastm bli_zdcastnzm bli_zdcastv bli_zdcopysc -bli_zdgemm_ker_var2_md bli_zdivsc bli_zdotaxpyv bli_zdotaxpyv_ex @@ -2184,174 +1063,89 @@ bli_zdotxf bli_zdotxf_ex bli_zdotxv bli_zdotxv_ex -bli_zdpackm_blk_var1_md -bli_zdpackm_cxk_1e_md -bli_zdpackm_cxk_1r_md -bli_zdpackm_struc_cxk_md bli_zdxpbym_md bli_zdxpbym_md_ex -bli_zdxpbym_md_unb_var1 +bli_zeqm +bli_zeqsc +bli_zeqv bli_zfprintm bli_zfprintv bli_zgemm -bli_zgemm1m -bli_zgemm3m1 -bli_zgemm3mh -bli_zgemm4m1 -bli_zgemm4mb -bli_zgemm4mb_ker_var2 -bli_zgemm4mh bli_zgemm_ex -bli_zgemm_ker_var2 -bli_zgemm_md_c2r_ref -bli_zgemmtrsm_l_ukernel -bli_zgemmtrsm_u_ukernel -bli_zgemm_ukernel +bli_zgemmt +bli_zgemmt_ex bli_zgemv bli_zgemv_ex -bli_zgemv_unb_var1 -bli_zgemv_unb_var2 -bli_zgemv_unf_var1 -bli_zgemv_unf_var2 bli_zger bli_zger_ex 
-bli_zger_unb_var1 -bli_zger_unb_var2 bli_zgetijm +bli_zgetijv bli_zgetsc bli_zhemm -bli_zhemm1m -bli_zhemm3m1 -bli_zhemm3mh -bli_zhemm4m1 -bli_zhemm4mh bli_zhemm_ex bli_zhemv bli_zhemv_ex -bli_zhemv_unb_var1 -bli_zhemv_unb_var2 -bli_zhemv_unb_var3 -bli_zhemv_unb_var4 -bli_zhemv_unf_var1 -bli_zhemv_unf_var1a -bli_zhemv_unf_var3 -bli_zhemv_unf_var3a bli_zher bli_zher2 bli_zher2_ex bli_zher2k -bli_zher2k1m -bli_zher2k3m1 -bli_zher2k3mh -bli_zher2k4m1 -bli_zher2k4mh bli_zher2k_ex -bli_zher2_unb_var1 -bli_zher2_unb_var2 -bli_zher2_unb_var3 -bli_zher2_unb_var4 -bli_zher2_unf_var1 -bli_zher2_unf_var4 bli_zher_ex bli_zherk -bli_zherk1m -bli_zherk3m1 -bli_zherk3mh -bli_zherk4m1 -bli_zherk4mh bli_zherk_ex -bli_zherk_l_ker_var2 -bli_zherk_u_ker_var2 -bli_zher_unb_var1 -bli_zher_unb_var2 bli_zinvertd bli_zinvertd_ex bli_zinvertsc bli_zinvertv bli_zinvertv_ex +bli_zinvscald +bli_zinvscald_ex +bli_zinvscalm +bli_zinvscalm_ex +bli_zinvscalv +bli_zinvscalv_ex bli_zipsc -bli_zipsc_check -bli_zipsc_qfp bli_zmachval bli_zmkherm bli_zmkherm_ex -bli_zmkherm_unb_var1 bli_zmksymm bli_zmksymm_ex -bli_zmksymm_unb_var1 bli_zmktrim bli_zmktrim_ex -bli_zmktrim_unb_var1 bli_zmulsc bli_znorm1m bli_znorm1m_ex -bli_znorm1m_unb_var1 bli_znorm1v bli_znorm1v_ex -bli_znorm1v_unb_var1 bli_znormfm bli_znormfm_ex -bli_znormfm_unb_var1 bli_znormfsc bli_znormfv bli_znormfv_ex -bli_znormfv_unb_var1 bli_znormim bli_znormim_ex -bli_znormim_unb_var1 bli_znormiv bli_znormiv_ex -bli_znormiv_unb_var1 -bli_zpackm_blk_var1 -bli_zpackm_cxk -bli_zpackm_cxk_1er -bli_zpackm_cxk_3mis -bli_zpackm_cxk_4mi -bli_zpackm_cxk_rih -bli_zpackm_herm_cxk -bli_zpackm_herm_cxk_1er -bli_zpackm_herm_cxk_3mis -bli_zpackm_herm_cxk_4mi -bli_zpackm_herm_cxk_rih -bli_zpackm_struc_cxk -bli_zpackm_struc_cxk_1er -bli_zpackm_struc_cxk_3mis -bli_zpackm_struc_cxk_4mi -bli_zpackm_struc_cxk_rih -bli_zpackm_tri_cxk -bli_zpackm_tri_cxk_1er -bli_zpackm_tri_cxk_3mis -bli_zpackm_tri_cxk_4mi -bli_zpackm_tri_cxk_rih -bli_zpackm_unb_var1 bli_zprintm 
-bli_zprintm_ex bli_zprintv -bli_zprintv_ex bli_zrandm bli_zrandm_ex -bli_zrandm_unb_var1 bli_zrandnm bli_zrandnm_ex -bli_zrandnm_unb_var1 bli_zrandnv bli_zrandnv_ex -bli_zrandnv_unb_var1 bli_zrandv bli_zrandv_ex -bli_zrandv_unb_var1 bli_zscal2d bli_zscal2d_ex bli_zscal2m bli_zscal2m_ex -bli_zscal2m_unb_var1 bli_zscal2v bli_zscal2v_ex bli_zscald bli_zscald_ex bli_zscalm bli_zscalm_ex -bli_zscalm_unb_var1 bli_zscalv bli_zscalv_ex bli_zscastm @@ -2363,42 +1157,29 @@ bli_zsetd_ex bli_zsetid bli_zsetid_ex bli_zsetijm +bli_zsetijv bli_zsetm bli_zsetm_ex -bli_zsetm_unb_var1 bli_zsetsc bli_zsetv bli_zsetv_ex -bli_zsgemm_ker_var2_md bli_zshiftd bli_zshiftd_ex -bli_zspackm_blk_var1_md -bli_zspackm_cxk_1e_md -bli_zspackm_cxk_1r_md -bli_zspackm_struc_cxk_md bli_zsqrtsc bli_zsubd bli_zsubd_ex bli_zsubm bli_zsubm_ex -bli_zsubm_unb_var1 bli_zsubsc bli_zsubv bli_zsubv_ex bli_zsumsqv bli_zsumsqv_ex -bli_zsumsqv_unb_var1 bli_zswapv bli_zswapv_ex bli_zsxpbym_md bli_zsxpbym_md_ex -bli_zsxpbym_md_unb_var1 bli_zsymm -bli_zsymm1m -bli_zsymm3m1 -bli_zsymm3mh -bli_zsymm4m1 -bli_zsymm4mh bli_zsymm_ex bli_zsymv bli_zsymv_ex @@ -2406,85 +1187,37 @@ bli_zsyr bli_zsyr2 bli_zsyr2_ex bli_zsyr2k -bli_zsyr2k1m -bli_zsyr2k3m1 -bli_zsyr2k3mh -bli_zsyr2k4m1 -bli_zsyr2k4mh bli_zsyr2k_ex bli_zsyr_ex bli_zsyrk -bli_zsyrk1m -bli_zsyrk3m1 -bli_zsyrk3mh -bli_zsyrk4m1 -bli_zsyrk4mh bli_zsyrk_ex bli_ztrmm -bli_ztrmm1m bli_ztrmm3 -bli_ztrmm31m -bli_ztrmm33m1 -bli_ztrmm33mh -bli_ztrmm34m1 -bli_ztrmm34mh bli_ztrmm3_ex -bli_ztrmm3m1 -bli_ztrmm4m1 bli_ztrmm_ex -bli_ztrmm_ll_ker_var2 -bli_ztrmm_lu_ker_var2 -bli_ztrmm_rl_ker_var2 -bli_ztrmm_ru_ker_var2 bli_ztrmv bli_ztrmv_ex -bli_ztrmv_unb_var1 -bli_ztrmv_unb_var2 -bli_ztrmv_unf_var1 -bli_ztrmv_unf_var2 bli_ztrsm -bli_ztrsm1m -bli_ztrsm3m1 -bli_ztrsm4m1 bli_ztrsm_ex -bli_ztrsm_ll_ker_var2 -bli_ztrsm_l_ukernel -bli_ztrsm_lu_ker_var2 -bli_ztrsm_rl_ker_var2 -bli_ztrsm_ru_ker_var2 -bli_ztrsm_u_ukernel bli_ztrsv bli_ztrsv_ex -bli_ztrsv_unb_var1 -bli_ztrsv_unb_var2 
-bli_ztrsv_unf_var1 -bli_ztrsv_unf_var2 -bli_zunpackm_blk_var1 -bli_zunpackm_cxk -bli_zunpackm_unb_var1 bli_zunzipsc bli_zxpbyd bli_zxpbyd_ex bli_zxpbym bli_zxpbym_ex -bli_zxpbym_unb_var1 bli_zxpbyv bli_zxpbyv_ex bli_zzcastm bli_zzcastnzm bli_zzcastv bli_zzcopysc -bli_zzgemm_ker_var2_md bli_zzipsc -bli_zzpackm_blk_var1_md -bli_zzpackm_cxk_1e_md -bli_zzpackm_cxk_1r_md -bli_zzpackm_struc_cxk_md bli_zzxpbym_md bli_zzxpbym_md_ex -bli_zzxpbym_md_unb_var1 sasum_ sasumsub_ +saxpby_ saxpy_ scabs1_ scasum_ @@ -2498,6 +1231,8 @@ sdsdot_ sdsdotsub_ sgbmv_ sgemm_ +sgemm_batch_ +sgemmt_ sgemv_ sger_ snrm2_ @@ -2528,6 +1263,7 @@ strsm_ strsv_ dasum_ dasumsub_ +daxpby_ daxpy_ dcabs1_ dcopy_ @@ -2535,6 +1271,8 @@ ddot_ ddotsub_ dgbmv_ dgemm_ +dgemm_batch_ +dgemmt_ dgemv_ dger_ dnrm2_ @@ -2569,6 +1307,7 @@ dzasum_ dzasumsub_ dznrm2_ dznrm2sub_ +caxpby_ caxpy_ ccopy_ cdotc_ @@ -2577,6 +1316,9 @@ cdotu_ cdotusub_ cgbmv_ cgemm_ +cgemm3m_ +cgemm_batch_ +cgemmt_ cgemv_ cgerc_ cgeru_ @@ -2606,6 +1348,7 @@ ctrmm_ ctrmv_ ctrsm_ ctrsv_ +zaxpby_ zaxpy_ zcopy_ zdotc_ @@ -2616,6 +1359,9 @@ zdrot_ zdscal_ zgbmv_ zgemm_ +zgemm3m_ +zgemm_batch_ +zgemmt_ zgemv_ zgerc_ zgeru_ @@ -2651,12 +1397,16 @@ isamax_ isamaxsub_ izamax_ izamaxsub_ +cblas_caxpby cblas_caxpy cblas_ccopy cblas_cdotc_sub cblas_cdotu_sub cblas_cgbmv cblas_cgemm +cblas_cgemm3m +cblas_cgemm_batch +cblas_cgemmt cblas_cgemv cblas_cgerc cblas_cgeru @@ -2685,11 +1435,14 @@ cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_dasum +cblas_daxpby cblas_daxpy cblas_dcopy cblas_ddot cblas_dgbmv cblas_dgemm +cblas_dgemm_batch +cblas_dgemmt cblas_dgemv cblas_dger cblas_dnrm2 @@ -2725,6 +1478,7 @@ cblas_idamax cblas_isamax cblas_izamax cblas_sasum +cblas_saxpby cblas_saxpy cblas_scasum cblas_scnrm2 @@ -2733,6 +1487,8 @@ cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm +cblas_sgemm_batch +cblas_sgemmt cblas_sgemv cblas_sger cblas_snrm2 @@ -2761,6 +1517,7 @@ cblas_strmv cblas_strsm cblas_strsv cblas_xerbla +cblas_zaxpby cblas_zaxpy cblas_zcopy 
cblas_zdotc_sub @@ -2768,6 +1525,9 @@ cblas_zdotu_sub cblas_zdscal cblas_zgbmv cblas_zgemm +cblas_zgemm3m +cblas_zgemm_batch +cblas_zgemmt cblas_zgemv cblas_zgerc cblas_zgeru diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 325ed0ecf..4888cbdaa 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -48,9 +48,8 @@ typedef void (*PASTECH(opname,_var_oft)) \ const obj_t* a, \ obj_t* p, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - const thrinfo_t* thread \ + const cntl_t* cntl, \ + thrinfo_t* thread \ ); GENTDEF( packm ) diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index 0a641cf9e..487116329 100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -77,8 +77,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -142,8 +141,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -216,8 +214,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )alpha, \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -305,8 +302,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )alpha, \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -373,8 +369,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ ( ctype* )alpha, \ x, rs_x, cs_x, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ } @@ -422,8 +417,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ ( ctype* )x, rs_x, cs_x, \ y, rs_y, cs_y, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ \ return; 
\ @@ -442,8 +436,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ ( ctype* )beta, \ y, rs_y, cs_y, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, @@ -524,8 +517,7 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( ctype_x* )x, rs_x, cs_x, \ ( ctype_y* )beta, \ y, rs_y, cs_y, \ - ( cntx_t* )cntx, \ - rntm \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index 1bcd9b9ca..9d051c169 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -51,8 +51,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -168,8 +167,7 @@ void PASTEMAC(ch,opname) \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -286,8 +284,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -395,8 +392,7 @@ void PASTEMAC(ch,opname) \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -513,8 +509,7 @@ void PASTEMAC2(chx,chy,opname) \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ uplo_t uplox_eff; \ diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h index fe01989e3..06aed2fe1 100644 --- a/frame/1m/bli_l1m_unb_var1.h +++ b/frame/1m/bli_l1m_unb_var1.h @@ -50,8 +50,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ 
ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( addm ) @@ -73,8 +72,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( axpym ) @@ -94,8 +92,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( invscalm ) @@ -117,8 +114,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( xpbym ) @@ -138,8 +134,7 @@ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 7d73bf903..80878fba0 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -39,6 +39,7 @@ #include "bli_packm_init.h" #include "bli_packm_int.h" #include "bli_packm_scalar.h" +#include "bli_packm_thrinfo.h" #include "bli_packm_part.h" diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index 07f54de78..18cc6f627 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -38,9 +38,8 @@ void* bli_packm_alloc ( siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { // Query the pack buffer type from the control tree node. 
@@ -50,51 +49,48 @@ void* bli_packm_alloc ( size_needed, pack_buf_type, - rntm, - cntl, thread ); } void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + thrinfo_t* thread ) { - // Query the address of the mem_t entry within the control tree node. - mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); + // Query the address of the mem_t entry within the thrinfo tree node. + mem_t* mem_p = bli_thrinfo_mem( thread ); + pba_t* pba = bli_thrinfo_pba( thread ); mem_t* local_mem_p; mem_t local_mem_s; - siz_t cntl_mem_size = 0; + siz_t mem_size = 0; - if ( bli_mem_is_alloc( cntl_mem_p ) ) - cntl_mem_size = bli_mem_size( cntl_mem_p ); + if ( bli_mem_is_alloc( mem_p ) ) + mem_size = bli_mem_size( mem_p ); - if ( cntl_mem_size < size_needed ) + if ( mem_size < size_needed ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) { // The chief thread releases the existing block associated with - // the mem_t entry in the control tree, and then re-acquires a + // the mem_t entry in the thrinfo tree, and then re-acquires a // new block, saving the associated mem_t entry to local_mem_s. - if ( bli_mem_is_alloc( cntl_mem_p ) ) + if ( bli_mem_is_alloc( mem_p ) ) { bli_pba_release ( - rntm, - cntl_mem_p + pba, + mem_p ); } bli_pba_acquire_m ( - rntm, + pba, size_needed, pack_buf_type, &local_mem_s @@ -103,17 +99,17 @@ void* bli_packm_alloc_ex // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_broadcast( rntm, thread, &local_mem_s ); + local_mem_p = bli_thrinfo_broadcast( thread, &local_mem_s ); // Save the chief thread's local mem_t entry to the mem_t field in - // this thread's control tree node. - *cntl_mem_p = *local_mem_p; + // this thread's thrinfo tree node. 
+ *mem_p = *local_mem_p; // Barrier so that the master thread doesn't return from the function // before we are done reading. - bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread ); } - return bli_mem_buffer( cntl_mem_p ); + return bli_mem_buffer( mem_p ); } diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h index aec2e1af5..a24e7a500 100644 --- a/frame/1m/packm/bli_packm_alloc.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -35,17 +35,14 @@ BLIS_EXPORT_BLIS void* bli_packm_alloc ( siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 601f2c05c..da49126a5 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -57,9 +57,8 @@ void bli_packm_blk_var1 const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { // Extract various fields from the control tree. @@ -71,7 +70,7 @@ void bli_packm_blk_var1 // Every thread initializes p and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t // entry in the control tree node). Return early if no packing is required. - if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) ) + if ( !bli_packm_init( c, p, cntx, cntl, thread ) ) return; // Check parameters. @@ -161,8 +160,8 @@ void bli_packm_blk_var1 // Query the number of threads and thread ids from the current thread's // packm thrinfo_t node. 
- const dim_t nt = bli_thread_n_way( thread ); - const dim_t tid = bli_thread_work_id( thread ); + const dim_t nt = bli_thrinfo_num_threads( thread ); + const dim_t tid = bli_thrinfo_thread_id( thread ); // Determine the thread range and increment using the current thread's // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 5797e3b94..870988fec 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -49,11 +49,10 @@ typedef struct BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( - const obj_t* c, - obj_t* p, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* t + const obj_t* c, + obj_t* p, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index e99ed9cf3..7f7401045 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -37,7 +37,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, @@ -57,7 +57,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node #endif // Allocate a packm_params_t struct. - params = bli_sba_acquire( rntm, sizeof( packm_params_t ) ); + params = bli_sba_acquire( pool, sizeof( packm_params_t ) ); // Initialize the packm_params_t struct. params->size = sizeof( packm_params_t ); @@ -79,7 +79,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node // sync with the cntl_t tree. 
cntl = bli_cntl_create_node ( - rntm, + pool, BLIS_NOID, BLIS_NO_PART, var_func, diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index be0fc8fde..8a43f711d 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -85,7 +85,7 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl ) cntl_t* bli_packm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 67e02ac0e..d4480f2c1 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -40,9 +40,8 @@ bool bli_packm_init const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -179,7 +178,7 @@ bool bli_packm_init // Update the buffer address in p to point to the buffer associated // with the mem_t entry acquired from the memory broker (now cached in // the control tree node). 
- void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); + void* buffer = bli_packm_alloc( size_p, cntl, thread ); bli_obj_set_buffer( buffer, p ); return true; diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 6f9b47273..b34bd5379 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -37,8 +37,7 @@ BLIS_EXPORT_BLIS bool bli_packm_init const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index ae788e671..fa4fcb47a 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -39,9 +39,8 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { bli_init_once(); @@ -51,7 +50,8 @@ void bli_packm_int // Barrier so that we know threads are done with previous computation // with the same packing buffer before starting to pack. - bli_thread_barrier( rntm, thread ); + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + bli_thrinfo_barrier( thread ); // Invoke the variant with kappa_use. f @@ -59,12 +59,11 @@ void bli_packm_int a, p, cntx, - rntm, cntl, thread ); // Barrier so that packing is done before computation. 
- bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index a4cf17d59..b7720cd3e 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -37,7 +37,6 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c index 650b6178c..b83a0271f 100644 --- a/frame/1m/packm/bli_packm_struc_cxk_md.c +++ b/frame/1m/packm/bli_packm_struc_cxk_md.c @@ -306,8 +306,6 @@ void PASTEMAC2(cha,chp,opname) \ PASTEMAC(cha,ctyper)* restrict alpha1_i = ( PASTEMAC(cha,ctyper)* )a + 1; \ PASTEMAC(chp,ctyper)* restrict pi1_r = ( PASTEMAC(chp,ctyper)* )p; \ PASTEMAC(chp,ctyper)* restrict pi1_i = ( PASTEMAC(chp,ctyper)* )p + ldp; \ -\ - ( void )kappa_i; \ \ if ( PASTEMAC(chp,eq1)( *kappa ) ) \ { \ diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c deleted file mode 100644 index 4b57971ef..000000000 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_packm_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bszid_t bszid, - thrinfo_t* sub_node - ) -{ - bli_thrinfo_init - ( - thread, - ocomm, ocomm_id, - n_way, work_id, - FALSE, - BLIS_NO_PART, - sub_node - ); -} - -void bli_packm_thrinfo_init_single - ( - thrinfo_t* thread - ) -{ - bli_packm_thrinfo_init - ( - thread, - &BLIS_SINGLE_COMM, 0, - 1, - 0, - BLIS_NO_PART, - NULL - ); -} - diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 85b61931c..1ac7f88df 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -5,7 +5,6 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -64,42 +63,3 @@ #endif - -// -// thrinfo_t APIs specific to packm. -// - -#if 0 -thrinfo_t* bli_packm_thrinfo_create - ( - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ); -#endif - -void bli_packm_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_packm_thrinfo_init_single - ( - thrinfo_t* thread - ); - -#if 0 -void bli_packm_thrinfo_free - ( - thrinfo_t* thread - ); -#endif - diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 95d0545be..e33e3b151 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -37,10 +37,10 @@ cntl_t* bli_unpackm_cntl_create_node ( - rntm_t* rntm, - void_fp var_func, - void_fp unpackm_var_func, - cntl_t* sub_node + pool_t* pool, + void_fp var_func, + void_fp unpackm_var_func, + cntl_t* sub_node ) { cntl_t* cntl; @@ -64,7 +64,7 @@ cntl_t* bli_unpackm_cntl_create_node // sync with the cntl_t tree. 
cntl = bli_cntl_create_node ( - rntm, + pool, BLIS_NOID, BLIS_NO_PART, var_func, diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 5c41d9465..075800d0a 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -48,9 +48,9 @@ typedef struct unpackm_params_s unpackm_params_t; cntl_t* bli_unpackm_cntl_create_node ( - rntm_t* rntm, - void_fp var_func, - void_fp unpackm_var_func, - cntl_t* sub_node + pool_t* pool, + void_fp var_func, + void_fp unpackm_var_func, + cntl_t* sub_node ); diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 3b542b061..2ced9a1a2 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -61,7 +61,7 @@ void bli_unpackm_int f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) { f ( @@ -74,6 +74,6 @@ void bli_unpackm_int } // Barrier so that unpacking is done before computation. 
- bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread ); } diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 9d39fc47d..a55091539 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -33,6 +33,10 @@ */ +#include "bli_l3_thrinfo.h" +#include "bli_l3_decor.h" +#include "bli_l3_sup_decor.h" + #include "bli_l3_cntl.h" #include "bli_l3_check.h" #include "bli_l3_int.h" diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 78482b5f6..586aeb6ea 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -35,6 +35,40 @@ #include "blis.h" +void bli_l3_adjust_kc + ( + const obj_t* a, + const obj_t* b, + dim_t* b_alg, + dim_t* b_max, + const cntx_t* cntx, + const cntl_t* cntl + ) +{ + const opid_t family = bli_cntl_family( cntl ); + const num_t dt = bli_obj_exec_dt( a ); + dim_t mnr = 1; + + // Nudge the default and maximum kc blocksizes up to the nearest + // multiple of MR if A is Hermitian, symmetric, or triangular or + // NR if B is Hermitian, symmetric, or triangular. If neither case + // applies, then we leave the blocksizes unchanged. For trsm we + // always use MR (rather than sometimes using NR) because even + // when the triangle is on the right, packing of that matrix uses + // MR, since only left-side trsm micro-kernels are supported. 
+ if ( !bli_obj_root_is_general( a ) || family == BLIS_TRSM ) + { + mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + } + else if ( !bli_obj_root_is_general( b ) ) + { + mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); + } + + *b_alg = bli_align_dim_to_mult( *b_alg, mnr ); + *b_max = bli_align_dim_to_mult( *b_max, mnr ); +} + dim_t bli_l3_determine_kc ( dir_t direct, @@ -47,261 +81,16 @@ dim_t bli_l3_determine_kc const cntl_t* cntl ) { - opid_t family = bli_cntl_family( cntl ); - - if ( family == BLIS_GEMM ) - return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); - else if ( family == BLIS_GEMMT ) - return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx ); - else if ( family == BLIS_TRMM ) - return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); - else if ( family == BLIS_TRSM ) - return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx ); - - // This should never execute. - return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); -} - -// ----------------------------------------------------------------------------- - -// -// NOTE: We call a gemm/hemm/symm, trmm, or trsm-specific blocksize -// function to determine the kc blocksize so that we can implement the -// "nudging" of kc to be a multiple of mr or nr, as needed. 
-// - -#undef GENFRONT -#define GENFRONT( opname, l3op ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - if ( direct == BLIS_FWD ) \ - return PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ); \ - else \ - return PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ); \ -} - -GENFRONT( gemm_determine_kc, gemm ) -GENFRONT( gemmt_determine_kc, gemmt ) -GENFRONT( trmm_determine_kc, trmm ) -GENFRONT( trsm_determine_kc, trsm ) - -// ----------------------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): + const num_t dt = bli_obj_exec_dt( a ); + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); + dim_t b_alg = bli_blksz_get_def( dt, bsize ); + dim_t b_max = bli_blksz_get_max( dt, bsize ); - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): + bli_l3_adjust_kc( a, b, &b_alg, &b_max, cntx, cntl ); - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). */ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. 
*/ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ - /* Nudge the default and maximum kc blocksizes up to the nearest - multiple of MR if A is Hermitian or symmetric, or NR if B is - Hermitian or symmetric. If neither case applies, then we leave - the blocksizes unchanged. */ \ - dim_t mnr; \ - if ( bli_obj_root_is_herm_or_symm( a ) ) \ - { \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ - } \ - else if ( bli_obj_root_is_herm_or_symm( b ) ) \ - { \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ - } \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ + if ( direct == BLIS_FWD ) + return bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); + else + return bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); } -GENFRONT( gemm_determine_kc_f, f ) -GENFRONT( gemm_determine_kc_b, b ) - -// ----------------------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): - - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): - - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). 
*/ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. */ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - const dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - const dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ - /* Notice that for gemmt, we do not need to perform any special handling - for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -} - -GENFRONT( gemmt_determine_kc_f, f ) -GENFRONT( gemmt_determine_kc_b, b ) - -// ----------------------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): - - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): - - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). */ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. */ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ - /* Nudge the default and maximum kc blocksizes up to the nearest - multiple of MR if the triangular matrix is on the left, or NR - if the triangular matrix is one the right. 
*/ \ - dim_t mnr; \ - if ( bli_obj_root_is_triangular( a ) ) \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - else \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -} - -GENFRONT( trmm_determine_kc_f, f ) -GENFRONT( trmm_determine_kc_b, b ) - -// ----------------------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): - - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): - - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). */ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. */ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ - /* Nudge the default and maximum kc blocksizes up to the nearest - multiple of MR. We always use MR (rather than sometimes using NR) - because even when the triangle is on the right, packing of that - matrix uses MR, since only left-side trsm micro-kernels are - supported. 
*/ \ - const dim_t mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -} - -GENFRONT( trsm_determine_kc_f, f ) -GENFRONT( trsm_determine_kc_b, b ) - diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 1ec889e03..843d5f241 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -32,6 +32,16 @@ */ +void bli_l3_adjust_kc + ( + const obj_t* a, + const obj_t* b, + dim_t* b_alg, + dim_t* b_max, + const cntx_t* cntx, + const cntl_t* cntl + ); + dim_t bli_l3_determine_kc ( dir_t direct, @@ -43,50 +53,3 @@ dim_t bli_l3_determine_kc const cntx_t* cntx, const cntl_t* cntl ); - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ); - -GENPROT( gemm_determine_kc ) -GENPROT( gemmt_determine_kc ) -GENPROT( trmm_determine_kc ) -GENPROT( trsm_determine_kc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ); - -GENPROT( gemm_determine_kc_f ) -GENPROT( gemm_determine_kc_b ) - -GENPROT( gemmt_determine_kc_f ) -GENPROT( gemmt_determine_kc_b ) - -GENPROT( trmm_determine_kc_f ) -GENPROT( trmm_determine_kc_b ) - -GENPROT( trsm_determine_kc_f ) -GENPROT( trsm_determine_kc_b ) - diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index d7fd9649e..27d140143 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -44,8 +44,8 @@ void bli_l3_cntl_create_if const obj_t* a, const obj_t* b, const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, + pool_t* pool, + 
const cntl_t* cntl_orig, cntl_t** cntl_use ) { @@ -59,7 +59,7 @@ void bli_l3_cntl_create_if { *cntl_use = bli_gemm_cntl_create ( - rntm, + pool, family, schema_a, schema_b, @@ -70,12 +70,18 @@ void bli_l3_cntl_create_if { side_t side; + // NOTE: We no longer ever use right-sided trsm, and therefore this + // function will only ever get called with side = BLIS_LEFT, which + // means that in the future, we can remove the a, b, and c operands + // from the function signature. (This assumes that the call to + // bli_obj_ker_fn( c ) is replaced in some future reorganization + // that moves the .ker_fn argument from obj_t to, say, the rntm_t.) if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; else side = BLIS_RIGHT; *cntl_use = bli_trsm_cntl_create ( - rntm, + pool, side, schema_a, schema_b, @@ -88,7 +94,7 @@ void bli_l3_cntl_create_if // If the user provided a control tree, create a copy and use it // instead (so that threads can use its local tree as a place to // cache things like pack mem_t entries). - *cntl_use = bli_cntl_copy( rntm, cntl_orig ); + *cntl_use = bli_cntl_copy( pool, cntl_orig ); // Recursively set the family fields of the newly copied control tree // nodes. 
@@ -98,9 +104,8 @@ void bli_l3_cntl_create_if void bli_l3_cntl_free ( - rntm_t* rntm, - cntl_t* cntl_use, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl_use ) { // NOTE: We don't actually need to call separate _cntl_free() functions @@ -114,11 +119,11 @@ void bli_l3_cntl_free family == BLIS_GEMMT || family == BLIS_TRMM ) { - bli_gemm_cntl_free( rntm, cntl_use, thread ); + bli_gemm_cntl_free( pool, cntl_use ); } else // if ( family == BLIS_TRSM ) { - bli_trsm_cntl_free( rntm, cntl_use, thread ); + bli_trsm_cntl_free( pool, cntl_use ); } } diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index eb4321ecd..68e837663 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -46,15 +46,14 @@ void bli_l3_cntl_create_if const obj_t* a, const obj_t* b, const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, + pool_t* pool, + const cntl_t* cntl_orig, cntl_t** cntl_use ); -void bli_l3_cntl_free +BLIS_EXPORT_BLIS void bli_l3_cntl_free ( - rntm_t* rntm, - cntl_t* cntl_use, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl_use ); diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c new file mode 100644 index 000000000..e482d37a1 --- /dev/null +++ b/frame/3/bli_l3_decor.c @@ -0,0 +1,298 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +struct l3_decor_params_s +{ + l3int_ft func; + opid_t family; + const obj_t* alpha; + const obj_t* a; + const obj_t* b; + const obj_t* beta; + const obj_t* c; + const cntx_t* cntx; + rntm_t* rntm; + array_t* array; +}; +typedef struct l3_decor_params_s l3_decor_params_t; + +static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const void* data_void ) +{ + const l3_decor_params_t* data = data_void; + + const l3int_ft func = data->func; + const opid_t family = data->family; + const obj_t* alpha = data->alpha; + const obj_t* a = data->a; + const obj_t* b = data->b; + const obj_t* beta = data->beta; + const obj_t* c = data->c; + const cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + array_t* array = data->array; + + bli_l3_thread_decorator_thread_check( gl_comm, rntm ); + + // Alias thread-local copies of A, B, and C. These will be the objects + // we pass down the algorithmic function stack. 
Making thread-local + // aliases is highly recommended in case a thread needs to change any + // of the properties of an object without affecting other threads' + // objects. + obj_t a_t, b_t, c_t; + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + bli_obj_alias_to( c, &c_t ); + + // This is part of a hack to support mixed domain in bli_gemm_front(). + // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. + pack_t schema_a = bli_obj_pack_schema( &a_t ); + pack_t schema_b = bli_obj_pack_schema( &b_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); + + // Create a default control tree for the operation, if needed. + cntl_t* cntl_use; + pool_t* sba_pool = bli_apool_array_elem( tid, array ); + bli_l3_cntl_create_if( family, schema_a, schema_b, + &a_t, &b_t, &c_t, sba_pool, NULL, &cntl_use ); + + // Create the root node of the current thread's thrinfo_t structure. + // The root node is the *parent* of the node corresponding to the first + // control tree node. + thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm, cntl_use ); + + func + ( + alpha, + &a_t, + &b_t, + beta, + &c_t, + cntx, + cntl_use, + thread + ); + + // Free the thread's local control tree. + bli_l3_cntl_free( sba_pool, cntl_use ); + + // Free the current thread's thrinfo_t structure. + bli_thrinfo_free( thread ); +} + +void bli_l3_thread_decorator + ( + l3int_ft func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm + ) +{ + rntm_t rntm_l = *rntm; + + // Query the threading implementation and the number of threads requested. 
+ timpl_t ti = bli_rntm_thread_impl( &rntm_l ); + dim_t nt = bli_rntm_num_threads( &rntm_l ); + +#if 0 + printf( "(pre-opt) application requested rntm.thread_impl = %s\n", + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); +#endif + + if ( bli_error_checking_is_enabled() ) + bli_l3_thread_decorator_check( &rntm_l ); + +#ifdef BLIS_ENABLE_NT1_VIA_SINGLE + if ( nt == 1 ) + { + // An optimization. If the caller requests only one thread, force + // the sequential level-3 thread decorator even if that means + // overriding the caller's preferred threading implementation (as + // communicated via the rntm_t). + rntm_l = *rntm; + ti = BLIS_SINGLE; + bli_rntm_set_thread_impl( BLIS_SINGLE, &rntm_l ); + rntm = &rntm_l; + } +#endif + + if ( 1 < nt && ti == BLIS_SINGLE ) + { + // Here, we resolve conflicting information. The caller requested + // a sequential threading implementation, but also requested more + // than one thread. Here, we choose to favor the requested threading + // implementation over the number of threads, and so reset all + // parallelism parameters to 1. + nt = 1; + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); + bli_rntm_set_num_threads_only( 1, &rntm_l ); + } + +#if 0 + printf( "(post-opt) moving forward with rntm.thread_impl = %s\n", + ( ti == BLIS_SINGLE ? "single" : + ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); +#endif + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. 
+ array_t* array = bli_sba_checkout_array( nt ); + + l3_decor_params_t params; + params.func = func; + params.family = family; + params.alpha = alpha; + params.a = a; + params.b = b; + params.beta = beta; + params.c = c; + params.cntx = cntx; + params.rntm = &rntm_l; + params.array = array; + + // Launch the threads using the threading implementation specified by ti, + // and use bli_l3_thread_decorator_entry() as their entry points. The + // params struct will be passed along to each thread. + bli_thread_launch( ti, nt, bli_l3_thread_decorator_entry, &params ); + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. + bli_sba_checkin_array( array ); +} + +void bli_l3_thread_decorator_check + ( + const rntm_t* rntm + ) +{ + //err_t e_val; + + //e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) ); + //bli_check_error_code( e_val ); + + const timpl_t ti = bli_rntm_thread_impl( rntm ); + + if ( +#ifndef BLIS_ENABLE_OPENMP + ti == BLIS_OPENMP || +#endif +#ifndef BLIS_ENABLE_PTHREADS + ti == BLIS_POSIX || +#endif + FALSE + ) + { + fprintf( stderr, "\n" ); + fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ?
"openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); + bli_abort(); + } +} + +void bli_l3_thread_decorator_thread_check + ( + thrcomm_t* gl_comm, + rntm_t* rntm + ) +{ +#ifdef BLIS_ENABLE_OPENMP + + if ( bli_thrcomm_thread_impl( gl_comm ) != BLIS_OPENMP) + return; + + dim_t n_threads_real = omp_get_num_threads(); + dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); + dim_t tid = omp_get_thread_num(); + + // Check if the number of OpenMP threads created within this parallel + // region is different from the number of threads that were requested + // of BLIS. This inequality may trigger when, for example, the + // following conditions are satisfied: + // - an application is executing an OpenMP parallel region in which + // BLIS is invoked, + // - BLIS is configured for multithreading via OpenMP, + // - OMP_NUM_THREADS = t > 1, + // - the number of threads requested of BLIS (regardless of method) + // is p <= t, + // - OpenMP nesting is disabled. + // In this situation, the application spawns t threads. Each application + // thread calls gemm (for example). Each gemm will attempt to spawn p + // threads via OpenMP. However, since nesting is disabled, the OpenMP + // implementation finds that t >= p threads are already spawned, and + // thus it doesn't spawn *any* additional threads for each gemm. + if ( n_threads_real != n_threads ) + { + // If the number of threads active in the current region is not + // equal to the number requested of BLIS, we then only continue + // if the number of threads in the current region is 1. If, for + // example, BLIS requested 4 threads but only got 3, then we + // abort(). 
+ if ( n_threads_real != 1 ) + { + bli_print_msg( "A different number of threads was " + "created than was requested.", + __FILE__, __LINE__ ); + bli_abort(); + } + + if ( tid == 0 ) + { + bli_thrcomm_init( BLIS_OPENMP, 1, gl_comm ); + bli_rntm_set_num_threads_only( 1, rntm ); + bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); + } + + // Synchronize all threads and continue. + _Pragma( "omp barrier" ) + } + +#endif +} + diff --git a/frame/thread/bli_l3_decor.h b/frame/3/bli_l3_decor.h similarity index 78% rename from frame/thread/bli_l3_decor.h rename to frame/3/bli_l3_decor.h index 087eda874..e00b8ed49 100644 --- a/frame/thread/bli_l3_decor.h +++ b/frame/3/bli_l3_decor.h @@ -45,26 +45,10 @@ typedef void (*l3int_ft) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); -// Level-3 thread decorator function type. -typedef void (*l3_decor_ft) - ( - l3int_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - // Level-3 thread decorator prototype. void bli_l3_thread_decorator ( @@ -76,20 +60,19 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm ); void bli_l3_thread_decorator_check ( - rntm_t* rntm + const rntm_t* rntm ); -// Include definitions specific to the method of multithreading for the -// conventional code path. 
-#include "bli_l3_decor_single.h" -#include "bli_l3_decor_openmp.h" -#include "bli_l3_decor_pthreads.h" +void bli_l3_thread_decorator_thread_check + ( + thrcomm_t* gl_comm, + rntm_t* rntm + ); #endif diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index b9d389839..70e6be3a9 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -42,8 +42,7 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -68,9 +67,9 @@ void bli_l3_int if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) bli_scalm( beta, c ); - bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread ); return; } @@ -82,9 +81,9 @@ void bli_l3_int // This should never execute. bli_abort(); - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) bli_scalm( beta, c ); - bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread ); return; } @@ -130,9 +129,6 @@ void bli_l3_int if ( !bli_obj_equals( beta, &BLIS_ONE ) ) bli_obj_scalar_apply_scalar( beta, &c_local ); - // Create the next node in the thrinfo_t structure. - bli_thrinfo_grow( rntm, cntl, thread ); - // Extract the function pointer from the current control tree node. 
l3_var_oft f = bli_cntl_var_func( cntl ); @@ -143,7 +139,6 @@ void bli_l3_int &b_local, &c_local, cntx, - rntm, cntl, thread ); diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h index 65485206d..8364d91e4 100644 --- a/frame/3/bli_l3_int.h +++ b/frame/3/bli_l3_int.h @@ -40,8 +40,7 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index 16e5f15de..76234525d 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -50,7 +50,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -90,8 +90,8 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -122,7 +122,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) bli_gemm_check( alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_gemm_front( alpha, a, b, beta, c, cntx, &rntm_l ); } #endif @@ -136,7 +136,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -157,8 +157,8 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -186,7 +186,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) bli_gemmt_check( alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_gemmt_front( alpha, a, b, beta, c, cntx, &rntm_l ); } @@ -198,7 +198,7 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -244,7 +244,7 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -277,7 +277,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -285,8 +285,8 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -314,7 +314,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) bli_hemm_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. 
- bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_hemm_front( side, alpha, a, b, beta, c, cntx, &rntm_l ); } @@ -327,7 +327,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -335,8 +335,8 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -364,7 +364,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) bli_symm_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_symm_front( side, alpha, a, b, beta, c, cntx, &rntm_l ); } @@ -377,7 +377,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -385,8 +385,8 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( c ); @@ -414,7 +414,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) bli_trmm3_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. 
- bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_trmm3_front( side, alpha, a, b, beta, c, cntx, &rntm_l ); } @@ -425,7 +425,7 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -459,7 +459,7 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -484,7 +484,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -492,8 +492,8 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( b ); @@ -520,7 +520,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) bli_trmm_check( side, alpha, a, b, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL ); + bli_trmm_front( side, alpha, a, b, cntx, &rntm_l ); } @@ -531,7 +531,7 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -539,8 +539,8 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Default to using native execution. num_t dt = bli_obj_dt( b ); @@ -567,5 +567,5 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) bli_trsm_check( side, alpha, a, b, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL ); + bli_trsm_front( side, alpha, a, b, cntx, &rntm_l ); } diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 58091704b..dd7624d92 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( gemm ) @@ -70,7 +70,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( hemm ) @@ -88,7 +88,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( herk ) @@ -105,7 +105,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* a, \ const obj_t* b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( trmm ) diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h index 997ade58e..67fa2c75d 100644 --- a/frame/3/bli_l3_oft.h +++ b/frame/3/bli_l3_oft.h @@ -54,7 +54,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( gemm ) @@ -77,7 +77,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* 
rntm \ ); GENTDEF( hemm ) @@ -97,7 +97,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( herk ) @@ -116,7 +116,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* a, \ const obj_t* b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( trmm ) diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index ee529b115..b295b5812 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -49,8 +49,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c index 6f18169b2..65776d49f 100644 --- a/frame/3/bli_l3_packab.c +++ b/frame/3/bli_l3_packab.c @@ -40,8 +40,7 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -60,7 +59,6 @@ void bli_l3_packa &a_local, &a_pack, cntx, - rntm, cntl, thread ); @@ -74,7 +72,6 @@ void bli_l3_packa &BLIS_ONE, c, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); @@ -88,8 +85,7 @@ void bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -112,7 +108,6 @@ void bli_l3_packb &bt_local, &bt_pack, cntx, - rntm, cntl, thread ); @@ -129,7 +124,6 @@ void bli_l3_packb &BLIS_ONE, c, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h index f03b7f62c..e58a08e4b 100644 --- a/frame/3/bli_l3_packab.h +++ b/frame/3/bli_l3_packab.h @@ -38,8 +38,7 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); @@ -49,8 +48,7 @@ void 
bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index eedbd9ec5..57513ab5b 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -42,7 +42,7 @@ err_t bli_gemmsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. @@ -89,8 +89,8 @@ err_t bli_gemmsup // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } #if 0 const num_t dt = bli_obj_dt( c ); @@ -127,7 +127,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n", beta, c, cntx, - rntm + &rntm_l ); } @@ -140,7 +140,7 @@ err_t bli_gemmtsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. @@ -174,8 +174,8 @@ err_t bli_gemmtsup // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // We've now ruled out the possibility that the sup thresholds are // unsatisfied. 
@@ -196,7 +196,7 @@ err_t bli_gemmtsup beta, c, cntx, - rntm + &rntm_l ); } diff --git a/frame/3/bli_l3_sup.h b/frame/3/bli_l3_sup.h index 33b3f8ca7..77ff02d91 100644 --- a/frame/3/bli_l3_sup.h +++ b/frame/3/bli_l3_sup.h @@ -40,7 +40,7 @@ err_t bli_gemmsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); err_t bli_gemmtsup @@ -51,6 +51,6 @@ err_t bli_gemmtsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); diff --git a/frame/thread/bli_l3_decor.c b/frame/3/bli_l3_sup_decor.c similarity index 59% rename from frame/thread/bli_l3_decor.c rename to frame/3/bli_l3_sup_decor.c index 33fb834be..5f415ac50 100644 --- a/frame/thread/bli_l3_decor.c +++ b/frame/3/bli_l3_sup_decor.c @@ -34,36 +34,63 @@ #include "blis.h" -// Initialize a function pointer array containing function addresses for -// each of the threading-specific level-3 thread decorators. - -static l3_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = +struct l3_sup_decor_params_s { - [BLIS_SINGLE] = bli_l3_thread_decorator_single, - [BLIS_OPENMP] = -#if defined(BLIS_ENABLE_OPENMP) - bli_l3_thread_decorator_openmp, -#elif defined(BLIS_ENABLE_PTHREADS) - NULL, -#else - NULL, -#endif - [BLIS_POSIX] = -#if defined(BLIS_ENABLE_PTHREADS) - bli_l3_thread_decorator_pthreads, -#elif defined(BLIS_ENABLE_OPENMP) - NULL, -#else - NULL, -#endif + l3supint_ft func; + opid_t family; + const obj_t* alpha; + const obj_t* a; + const obj_t* b; + const obj_t* beta; + const obj_t* c; + const cntx_t* cntx; + rntm_t* rntm; + array_t* array; }; +typedef struct l3_sup_decor_params_s l3_sup_decor_params_t; + +static void bli_l3_sup_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const void* data_void ) +{ + const l3_sup_decor_params_t* data = data_void; + + const l3supint_ft func = data->func; + const opid_t family = data->family; + const obj_t* alpha = data->alpha; + const obj_t* a = data->a; + const obj_t* b = data->b; + const obj_t* beta = 
data->beta; + const obj_t* c = data->c; + const cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + array_t* array = data->array; + + ( void )family; + + bli_l3_thread_decorator_thread_check( gl_comm, rntm ); + + // Create the root node of the thread's thrinfo_t structure. + pool_t* pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm, + thread + ); -// Define a dispatcher that chooses a threading-specific function from the -// above function pointer array. + // Free the current thread's thrinfo_t structure. + bli_thrinfo_free( thread ); +} -void bli_l3_thread_decorator +err_t bli_l3_sup_thread_decorator ( - l3int_ft func, + l3supint_ft func, opid_t family, const obj_t* alpha, const obj_t* a, @@ -71,15 +98,14 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm ) { - rntm_t rntm_l; + rntm_t rntm_l = *rntm; // Query the threading implementation and the number of threads requested. - timpl_t ti = bli_rntm_thread_impl( rntm ); - dim_t nt = bli_rntm_num_threads( rntm ); + timpl_t ti = bli_rntm_thread_impl( &rntm_l ); + dim_t nt = bli_rntm_num_threads( &rntm_l ); #if 0 printf( "(pre-opt) application requested rntm.thread_impl = %s\n", @@ -88,7 +114,7 @@ void bli_l3_thread_decorator #endif if ( bli_error_checking_is_enabled() ) - bli_l3_thread_decorator_check( rntm ); + bli_l3_thread_decorator_check( &rntm_l ); #ifdef BLIS_ENABLE_NT1_VIA_SINGLE if ( nt == 1 ) @@ -111,11 +137,9 @@ void bli_l3_thread_decorator // than one thread. Here, we choose to favor the requested threading // implementation over the number of threads, and so reset all // parallelism parameters to 1. 
- rntm_l = *rntm; nt = 1; bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); bli_rntm_set_num_threads_only( 1, &rntm_l ); - rntm = &rntm_l; } #if 0 @@ -124,53 +148,28 @@ void bli_l3_thread_decorator ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); #endif - // Use the timpl_t value to index into the corresponding function address - // from the function pointer array. - const l3_decor_ft fp = l3_decor_fpa[ ti ]; - - // Call the threading-specific decorator function. - fp - ( - func, - family, - alpha, - a, - b, - beta, - c, - cntx, - rntm, - cntl - ); -} - -void bli_l3_thread_decorator_check - ( - rntm_t* rntm - ) -{ - //err_t e_val; - - //e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) ); - //bli_check_error_code( e_val ); - - const timpl_t ti = bli_rntm_thread_impl( rntm ); - - if ( -#ifndef BLIS_ENABLE_OPENMP - ti == BLIS_OPENMP || -#endif -#ifndef BLIS_ENABLE_PTHREADS - ti == BLIS_POSIX || -#endif - FALSE - ) - { - fprintf( stderr, "\n" ); - fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); - fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); - fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); - bli_abort(); - } + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. 
+ array_t* array = bli_sba_checkout_array( nt ); + + l3_sup_decor_params_t params; + params.func = func; + params.family = family; + params.alpha = alpha; + params.a = a; + params.b = b; + params.beta = beta; + params.c = c; + params.cntx = cntx; + params.rntm = &rntm_l; + params.array = array; + + bli_thread_launch( ti, nt, bli_l3_sup_thread_decorator_entry, &params ); + + bli_sba_checkin_array( array ); + + return BLIS_SUCCESS; } diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/3/bli_l3_sup_decor.h similarity index 77% rename from frame/thread/bli_l3_sup_decor.h rename to frame/3/bli_l3_sup_decor.h index a271920b4..c8f31a10f 100644 --- a/frame/thread/bli_l3_sup_decor.h +++ b/frame/3/bli_l3_sup_decor.h @@ -47,24 +47,10 @@ typedef err_t (*l3supint_ft) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ); -// Level-3 sup thread decorator function type. -typedef err_t (*l3_sup_decor_ft) - ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm - ); - // Level-3 sup thread decorator prototype. err_t bli_l3_sup_thread_decorator ( @@ -76,19 +62,8 @@ err_t bli_l3_sup_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); -void bli_l3_sup_thread_decorator_check - ( - rntm_t* rntm - ); - -// Include definitions specific to the method of multithreading for the -// sup code path. 
-#include "bli_l3_sup_decor_single.h" -#include "bli_l3_sup_decor_openmp.h" -#include "bli_l3_sup_decor_pthreads.h" - #endif diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index 3ff13bdb5..ffba1d661 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -42,7 +42,7 @@ err_t bli_gemmsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -137,15 +137,16 @@ err_t bli_gemmsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. - bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + rntm_t rntm_l = *rntm; + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() @@ -156,7 +157,7 @@ err_t bli_gemmsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() @@ -202,15 +203,16 @@ err_t bli_gemmsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. 
- bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + rntm_t rntm_l = *rntm; + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans @@ -221,7 +223,7 @@ err_t bli_gemmsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans @@ -246,7 +248,7 @@ err_t bli_gemmtsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -311,15 +313,16 @@ err_t bli_gemmtsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. 
- bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + rntm_t rntm_l = *rntm; + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() @@ -332,7 +335,7 @@ err_t bli_gemmtsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() @@ -380,15 +383,16 @@ err_t bli_gemmtsup_int // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. - bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); - bli_l3_sup_thrinfo_update_root( rntm, thread ); + rntm_t rntm_l = *rntm; + bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, &rntm_l ); + bli_l3_sup_thrinfo_update( &rntm_l, &thread ); } if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans @@ -401,7 +405,7 @@ err_t bli_gemmtsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h index 195e3ca40..e76f21360 100644 --- a/frame/3/bli_l3_sup_int.h +++ b/frame/3/bli_l3_sup_int.h @@ -40,7 +40,7 @@ err_t bli_gemmsup_int const obj_t* beta, const obj_t* c, const cntx_t* 
cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ); @@ -52,6 +52,6 @@ err_t bli_gemmtsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h index ba60035b7..c36197201 100644 --- a/frame/3/bli_l3_sup_oft.h +++ b/frame/3/bli_l3_sup_oft.h @@ -53,7 +53,7 @@ typedef err_t (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( gemmsup ) diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c index 5ed7700dc..797335aeb 100644 --- a/frame/3/bli_l3_sup_packm.c +++ b/frame/3/bli_l3_sup_packm.c @@ -43,8 +43,6 @@ void bli_packm_sup_init_mem dim_t m, dim_t k, dim_t mr, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ) { @@ -54,6 +52,9 @@ void bli_packm_sup_init_mem } else // if ( will_pack == TRUE ) { + mem_t* mem = bli_thrinfo_mem( thread ); + pba_t* pba = bli_thrinfo_pba( thread ); + // NOTE: This "rounding up" of the last upanel is actually optional // for the rrc/crc cases, but absolutely necessary for the other cases // since we NEED that last micropanel to have the same ldim (cs_p) as @@ -64,7 +65,7 @@ void bli_packm_sup_init_mem // Barrier to make sure all threads are caught up and ready to begin // the packm stage. - bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread ); // Compute the size of the memory block eneded. siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack; @@ -73,7 +74,7 @@ void bli_packm_sup_init_mem // then we need to acquire a block from the pba. if ( bli_mem_is_unalloc( mem ) ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) { // Acquire directly to the chief thread's mem_t that was // passed in. It needs to be that mem_t struct, and not a @@ -85,7 +86,7 @@ void bli_packm_sup_init_mem // then again, I prefer to keep barriers to a minimum.) 
bli_pba_acquire_m ( - rntm, + pba, size_needed, pack_buf_type, mem @@ -94,13 +95,13 @@ void bli_packm_sup_init_mem // Broadcast the address of the chief thread's passed-in mem_t // to all threads. - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); // Non-chief threads: Copy the contents of the chief thread's // passed-in mem_t to the passed-in mem_t for this thread. (The // chief thread already has the mem_t, so it does not need to // perform any copy.) - if ( !bli_thread_am_ochief( thread ) ) + if ( !bli_thrinfo_am_chief( thread ) ) { *mem = *mem_p; } @@ -119,7 +120,7 @@ void bli_packm_sup_init_mem if ( mem_size < size_needed ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) { // The chief thread releases the existing block associated // with the mem_t, and then re-acquires a new block, saving @@ -129,12 +130,12 @@ void bli_packm_sup_init_mem // (temporary) mem_t. bli_pba_release ( - rntm, + pba, mem ); bli_pba_acquire_m ( - rntm, + pba, size_needed, pack_buf_type, mem @@ -143,13 +144,13 @@ void bli_packm_sup_init_mem // Broadcast the address of the chief thread's passed-in mem_t // to all threads. - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); // Non-chief threads: Copy the contents of the chief thread's // passed-in mem_t to the passed-in mem_t for this thread. (The // chief thread already has the mem_t, so it does not need to // perform any copy.) 
- if ( !bli_thread_am_ochief( thread ) ) + if ( !bli_thrinfo_am_chief( thread ) ) { *mem = *mem_p; } @@ -166,8 +167,6 @@ void bli_packm_sup_init_mem void bli_packm_sup_finalize_mem ( bool did_pack, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ) { @@ -178,8 +177,11 @@ void bli_packm_sup_finalize_mem } else // if ( did_pack == TRUE ) { + mem_t* mem = bli_thrinfo_mem( thread ); + pba_t* pba = bli_thrinfo_pba( thread ); + if ( thread != NULL ) - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thrinfo_am_chief( thread ) ) { // Check the mem_t entry provided by the caller. Only proceed if it // is allocated, which it should be. @@ -187,7 +189,7 @@ void bli_packm_sup_finalize_mem { bli_pba_release ( - rntm, + pba, mem ); } @@ -197,18 +199,18 @@ void bli_packm_sup_finalize_mem void bli_packm_sup_init ( - bool will_pack, - stor3_t stor_id, - pack_t* schema, - dim_t m, - dim_t k, - dim_t mr, - dim_t* m_max, - dim_t* k_max, - const void* x, inc_t rs_x, inc_t cs_x, - void** p, inc_t* rs_p, inc_t* cs_p, - dim_t* pd_p, inc_t* ps_p, - mem_t* mem + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + thrinfo_t* thread ) { // Inspect whether we are going to be packing matrix A. @@ -277,7 +279,7 @@ void bli_packm_sup_init // Set the buffer address provided by the caller to point to the // memory associated with the mem_t entry acquired from the pba. 
- *p = bli_mem_buffer( mem ); + *p = bli_mem_buffer( bli_thrinfo_mem( thread ) ); } } @@ -334,8 +336,6 @@ void bli_packm_sup void** p, inc_t* rs_p, inc_t* cs_p, inc_t* ps_p, const cntx_t* cntx, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ) { @@ -351,8 +351,6 @@ void bli_packm_sup will_pack, pack_buf_type, dt, m_alloc, k_alloc, mr, - rntm, - mem, thread ); @@ -369,7 +367,7 @@ void bli_packm_sup a, rs_a, cs_a, p, rs_p, cs_p, &pd_p, ps_p, - mem + thread ); // Inspect whether we are going to be packing matrix A. @@ -422,7 +420,7 @@ void bli_packm_sup } // Barrier so that packing is done before computation. - bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread ); } } diff --git a/frame/3/bli_l3_sup_packm.h b/frame/3/bli_l3_sup_packm.h index a84d4e45c..032ba0afe 100644 --- a/frame/3/bli_l3_sup_packm.h +++ b/frame/3/bli_l3_sup_packm.h @@ -42,16 +42,12 @@ void bli_packm_sup_init_mem dim_t m, dim_t k, dim_t mr, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ); void bli_packm_sup_finalize_mem ( bool did_pack, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ); @@ -68,7 +64,7 @@ void bli_packm_sup_init const void* x, inc_t rs_x, inc_t cs_x, void** p, inc_t* rs_p, inc_t* cs_p, dim_t* pd_p, inc_t* ps_p, - mem_t* mem + thrinfo_t* thread ); void bli_packm_sup @@ -88,8 +84,6 @@ void bli_packm_sup void** p, inc_t* rs_p, inc_t* cs_p, inc_t* ps_p, const cntx_t* cntx, - rntm_t* rntm, - mem_t* mem, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 71357cec4..e47f65aea 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -145,8 +145,8 @@ void PASTEMAC(ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. 
*/ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ @@ -234,9 +234,9 @@ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel \ /* if ( col_stored ) { \ - if ( bli_thread_work_id( thread ) == 0 ) \ + if ( bli_thrinfo_work_id( thread ) == 0 ) \ { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \ fflush( stdout ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ @@ -244,10 +244,10 @@ if ( col_stored ) { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ - if ( bli_thread_work_id( thread ) == 1 ) \ +bli_thrinfo_barrier( thread ); \ + if ( bli_thrinfo_work_id( thread ) == 1 ) \ { \ - printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \ fflush( stdout ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ @@ -255,12 +255,12 @@ bli_thread_barrier( rntm, thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ +bli_thrinfo_barrier( thread ); \ } \ else { \ - if ( bli_thread_work_id( thread ) == 0 ) \ + if ( bli_thrinfo_work_id( thread ) == 0 ) \ { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + printf( "packm_blk_var1: thread %lu (b = 
%p, bp = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \ fflush( stdout ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ @@ -268,10 +268,10 @@ else { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ - if ( bli_thread_work_id( thread ) == 1 ) \ +bli_thrinfo_barrier( thread ); \ + if ( bli_thrinfo_work_id( thread ) == 1 ) \ { \ - printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thrinfo_work_id( thread ), c_use, p_use ); \ fflush( stdout ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ @@ -279,7 +279,7 @@ bli_thread_barrier( rntm, thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_barrier( rntm, thread ); \ +bli_thrinfo_barrier( thread ); \ } \ */ /* @@ -388,8 +388,8 @@ void PASTEMAC(ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). 
*/ \ ( void )nt; \ diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index e4858621a..5d7ea345c 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -48,7 +48,7 @@ void bli_gemmsup_ref_var1n const obj_t* c, stor3_t stor_id, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -118,18 +118,18 @@ void bli_gemmsup_ref_var1n // Note: This code explicitly performs the swaps that could be done // implicitly in other BLIS contexts where a type-specific helper function // was being called. - if ( bli_is_trans( trans ) ) - { - bool packtmp = packa; packa = packb; packb = packtmp; - conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; - dim_t len_tmp = m; m = n; n = len_tmp; - const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; - inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; - str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; - str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; - - stor_id = bli_stor3_trans( stor_id ); - } + if ( bli_is_trans( trans ) ) + { + bool packtmp = packa; packa = packb; packb = packtmp; + conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; + dim_t len_tmp = m; m = n; n = len_tmp; + const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; + inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; + str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; + str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; + + stor_id = bli_stor3_trans( stor_id ); + } // This transposition of the stor3_t id value is inherent to variant 1. // The reason: we assume that variant 2 is the "main" variant. The @@ -230,55 +230,15 @@ void bli_gemmsup_ref_var1n auxinfo_t aux; - mem_t mem_a = BLIS_MEM_INITIALIZER; - mem_t mem_b = BLIS_MEM_INITIALIZER; - - // Define an array of bszid_t ids, which will act as our substitute for - // the cntl_t tree. 
- // NOTE: These bszid_t values, and their order, match that of the bp - // algorithm (variant 2) because they are not used to query actual - // blocksizes but rather query the ways of parallelism for the various - // loops. For example, the 2nd loop in variant 1 partitions in the m - // dimension (in increments of MR), but parallelizes that m dimension - // with BLIS_JR_NT. - // Note that this panel-block algorithm partitions an NC x KC submatrix - // of A to be packed in the 4th loop, and a KC x MC submatrix of B to be - // packed in the 3rd loop. - // 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop - bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; - // Determine whether we are using more than one thread. const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); - thrinfo_t* thread_jc = NULL; - thrinfo_t* thread_pc = NULL; - thrinfo_t* thread_pa = NULL; - thrinfo_t* thread_ic = NULL; - thrinfo_t* thread_pb = NULL; - thrinfo_t* thread_jr = NULL; - - // Pre-grow the thrinfo_t tree. 
- bszid_t* bszids_jc = bszids; - thread_jc = thread; - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); - - bszid_t* bszids_pc = &bszids_jc[1]; - thread_pc = bli_thrinfo_sub_node( thread_jc ); - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); - - bszid_t* bszids_pa = &bszids_pc[1]; - thread_pa = bli_thrinfo_sub_node( thread_pc ); - - bszid_t* bszids_ic = &bszids_pa[1]; - thread_ic = bli_thrinfo_sub_node( thread_pa ); - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); - - bszid_t* bszids_pb = &bszids_ic[1]; - thread_pb = bli_thrinfo_sub_node( thread_ic ); - - bszid_t* bszids_jr = &bszids_pb[1]; - thread_jr = bli_thrinfo_sub_node( thread_pb ); - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + thrinfo_t* thread_jc = bli_thrinfo_sub_node( thread ); + thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc ); + thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pa ); + thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pb ); // Compute the JC loop thread range for the current thread. dim_t jc_start, jc_end; @@ -320,7 +280,7 @@ void bli_gemmsup_ref_var1n // Only apply beta to the first iteration of the pc loop. const void* beta_use = ( pp == 0 ? buf_beta : one ); - char* a_use; + char* a_use; inc_t rs_a_use, cs_a_use, ps_a_use; // Determine the packing buffer and related parameters for matrix @@ -344,8 +304,6 @@ void bli_gemmsup_ref_var1n ( void** )&a_use, &rs_a_use, &cs_a_use, &ps_a_use, cntx, - rntm, - &mem_a, thread_pa ); @@ -402,8 +360,6 @@ void bli_gemmsup_ref_var1n ( void** )&b_use, &cs_b_use, &rs_b_use, &ps_b_use, cntx, - rntm, - &mem_b, thread_pb ); @@ -472,7 +428,7 @@ void bli_gemmsup_ref_var1n // NOTE: This barrier is only needed if we are packing A (since // that matrix is packed within the pc loop of this variant). 
- if ( packa ) bli_thread_barrier( rntm, thread_pa ); + if ( packa ) bli_thrinfo_barrier( thread_pa ); } } @@ -480,15 +436,11 @@ void bli_gemmsup_ref_var1n bli_packm_sup_finalize_mem ( packa, - rntm, - &mem_a, thread_pa ); bli_packm_sup_finalize_mem ( packb, - rntm, - &mem_b, thread_pb ); @@ -514,7 +466,7 @@ void bli_gemmsup_ref_var2m const obj_t* c, stor3_t stor_id, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { @@ -680,46 +632,15 @@ void bli_gemmsup_ref_var2m auxinfo_t aux; - mem_t mem_a = BLIS_MEM_INITIALIZER; - mem_t mem_b = BLIS_MEM_INITIALIZER; - - // Define an array of bszid_t ids, which will act as our substitute for - // the cntl_t tree. - // 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop - bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; - // Determine whether we are using more than one thread. const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); - thrinfo_t* thread_jc = NULL; - thrinfo_t* thread_pc = NULL; - thrinfo_t* thread_pb = NULL; - thrinfo_t* thread_ic = NULL; - thrinfo_t* thread_pa = NULL; - thrinfo_t* thread_jr = NULL; - - // Pre-grow the thrinfo_t tree. 
- bszid_t* bszids_jc = bszids; - thread_jc = thread; - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); - - bszid_t* bszids_pc = &bszids_jc[1]; - thread_pc = bli_thrinfo_sub_node( thread_jc ); - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); - - bszid_t* bszids_pb = &bszids_pc[1]; - thread_pb = bli_thrinfo_sub_node( thread_pc ); - - bszid_t* bszids_ic = &bszids_pb[1]; - thread_ic = bli_thrinfo_sub_node( thread_pb ); - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); - - bszid_t* bszids_pa = &bszids_ic[1]; - thread_pa = bli_thrinfo_sub_node( thread_ic ); - - bszid_t* bszids_jr = &bszids_pa[1]; - thread_jr = bli_thrinfo_sub_node( thread_pa ); - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + thrinfo_t* thread_jc = bli_thrinfo_sub_node( thread ); + thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc ); + thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pb ); + thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pa ); // Compute the JC loop thread range for the current thread. dim_t jc_start, jc_end; @@ -783,8 +704,6 @@ void bli_gemmsup_ref_var2m ( void** )&b_use, &cs_b_use, &rs_b_use, &ps_b_use, cntx, - rntm, - &mem_b, thread_pb ); @@ -839,8 +758,6 @@ void bli_gemmsup_ref_var2m ( void** )&a_use, &rs_a_use, &cs_a_use, &ps_a_use, cntx, - rntm, - &mem_a, thread_pa ); @@ -909,7 +826,7 @@ void bli_gemmsup_ref_var2m // NOTE: This barrier is only needed if we are packing B (since // that matrix is packed within the pc loop of this variant). 
- if ( packb ) bli_thread_barrier( rntm, thread_pb ); + if ( packb ) bli_thrinfo_barrier( thread_pb ); } } @@ -917,15 +834,11 @@ void bli_gemmsup_ref_var2m bli_packm_sup_finalize_mem ( packa, - rntm, - &mem_a, thread_pa ); bli_packm_sup_finalize_mem ( packb, - rntm, - &mem_b, thread_pb ); diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index be6b17f39..8bbb73ca9 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -50,7 +50,7 @@ void PASTEMAC0(opname) \ const obj_t* c, \ stor3_t eff_id, \ const cntx_t* cntx, \ - rntm_t* rntm, \ + const rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c index c934ba949..130237ee4 100644 --- a/frame/3/bli_l3_tapi_ex.c +++ b/frame/3/bli_l3_tapi_ex.c @@ -55,7 +55,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -115,7 +115,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -178,7 +178,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -236,7 +236,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -298,7 +298,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -355,7 +355,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ 
+ const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -418,7 +418,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -481,7 +481,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -545,7 +545,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h index eb142af05..d8610dee8 100644 --- a/frame/3/bli_l3_tapi_ex.h +++ b/frame/3/bli_l3_tapi_ex.h @@ -54,7 +54,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) @@ -76,7 +76,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) @@ -97,7 +97,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) @@ -119,7 +119,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) @@ -139,7 +139,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* 
rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) @@ -161,7 +161,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) @@ -186,7 +186,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) @@ -207,7 +207,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index f866cfd4c..402497153 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -34,139 +34,135 @@ */ #include "blis.h" -#include "assert.h" -void bli_l3_thrinfo_init_single +thrinfo_t* bli_l3_thrinfo_create ( - thrinfo_t* thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm, + const cntl_t* cntl ) { - bli_thrinfo_init_single( thread ); -} + pool_t* pool = NULL; + if ( array != NULL ) + pool = bli_apool_array_elem( id, array ); -void bli_l3_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ) -{ - bli_thrinfo_free( rntm, thread ); -} + // Create the root thrinfo_t node. 
+ thrinfo_t* root = bli_thrinfo_create_root + ( + gl_comm, + id, + pool, + bli_pba_query() + ); -void bli_l3_sup_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ) -{ - bli_thrinfo_free( rntm, thread ); -} + bli_l3_thrinfo_grow( root, rntm, cntl ); -// ----------------------------------------------------------------------------- + return root; +} -void bli_l3_thrinfo_create_root +void bli_l3_thrinfo_grow ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t** thread + thrinfo_t* thread_par, + const rntm_t* rntm, + const cntl_t* cntl ) { - // Query the global communicator for the total number of threads to use. - dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); - - // Use the thread id passed in as the global communicator id. - dim_t gl_comm_id = id; + const cntl_t* sub_prenode = bli_cntl_sub_prenode( cntl ); + const cntl_t* sub_node = bli_cntl_sub_node( cntl ); + const bszid_t bszid = bli_cntl_bszid( cntl ); + const dim_t n_way = bli_rntm_ways_for( bszid, rntm ); - // Use the blocksize id of the current (root) control tree node to - // query the top-most ways of parallelism to obtain. - bszid_t bszid = bli_cntl_bszid( cntl ); - dim_t xx_way = bli_rntm_ways_for( bszid, rntm ); + thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par ); + bli_thrinfo_set_sub_node( thread_cur, thread_par ); - // Determine the work id for this thrinfo_t node. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); + if ( sub_prenode != NULL ) + { + // A pre-node is only used in the IC loop of trsm. In this case, + // we cannot actually thread in the m dimension due to data dependencies + // and so all parallelism must be moved down to the JR loop. 
+ rntm_t rntm_l = *rntm; + const dim_t ic_nway = bli_rntm_ic_ways( &rntm_l ); + const dim_t jr_nway = bli_rntm_jr_ways( &rntm_l ); + bli_rntm_set_ic_ways_only( 1, &rntm_l ); + bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l ); + + // Use thread_pre instead of thread_cur since we *don't* want to + // do any parallelism at this level. + thrinfo_t* thread_pre = bli_thrinfo_split( 1, thread_par ); + bli_thrinfo_set_sub_prenode( thread_pre, thread_par ); + bli_l3_thrinfo_grow( thread_pre, &rntm_l, sub_prenode ); + } - // Create the root thrinfo_t node. - *thread = bli_thrinfo_create - ( - rntm, - gl_comm, - gl_comm_id, - xx_way, - work_id, - TRUE, - bszid, - NULL - ); + if ( sub_node != NULL ) + { + bli_l3_thrinfo_grow( thread_cur, rntm, sub_node ); + } } // ----------------------------------------------------------------------------- -void bli_l3_sup_thrinfo_create_root +thrinfo_t* bli_l3_sup_thrinfo_create ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - thrinfo_t** thread + dim_t id, + thrcomm_t* gl_comm, + pool_t* pool, + const rntm_t* rntm ) { - // Query the global communicator for the total number of threads to use. - dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); - - // Use the thread id passed in as the global communicator id. - dim_t gl_comm_id = id; - - // Use the BLIS_NC blocksize id to query the top-most ways of parallelism - // to obtain. Note that hard-coding BLIS_NC like this is a little bit of a - // hack, but it works fine since both of the sup algorithms (bp and pb) use - // the cache blocksizes down to the 3rd loop. (See the definitions of - // bli_rntm_calc_num_threads_bp() and bli_rntm_calc_num_threads_pb() for - // a concise enumeration of these bszid_t ids.) - const bszid_t bszid = BLIS_NC; - dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); - - // Determine the work id for this thrinfo_t node. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - // Create the root thrinfo_t node. 
- *thread = bli_thrinfo_create + thrinfo_t* root = bli_thrinfo_create_root ( - rntm, gl_comm, - gl_comm_id, - xx_way, - work_id, - TRUE, - bszid, - NULL + id, + pool, + bli_pba_query() ); -} -// ----------------------------------------------------------------------------- + const dim_t n_way_jc = bli_rntm_ways_for( BLIS_NC, rntm ); + const dim_t n_way_pc = bli_rntm_ways_for( BLIS_KC, rntm ); + const dim_t n_way_ic = bli_rntm_ways_for( BLIS_MC, rntm ); + const dim_t n_way_jr = bli_rntm_ways_for( BLIS_NR, rntm ); + const dim_t n_way_ir = bli_rntm_ways_for( BLIS_MR, rntm ); + + thrinfo_t* thread_jc = bli_thrinfo_split( n_way_jc, root ); + thrinfo_t* thread_pc = bli_thrinfo_split( n_way_pc, thread_jc ); + thrinfo_t* thread_pb = bli_thrinfo_split( 1, thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_split( n_way_ic, thread_pb ); + thrinfo_t* thread_pa = bli_thrinfo_split( 1, thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa ); + thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr ); + + bli_thrinfo_set_sub_node( thread_jc, root ); + bli_thrinfo_set_sub_node( thread_pc, thread_jc ); + bli_thrinfo_set_sub_node( thread_pb, thread_pc ); + bli_thrinfo_set_sub_node( thread_ic, thread_pb ); + bli_thrinfo_set_sub_node( thread_pa, thread_ic ); + bli_thrinfo_set_sub_node( thread_jr, thread_pa ); + bli_thrinfo_set_sub_node( thread_ir, thread_jr ); + + return root; +} -void bli_l3_sup_thrinfo_update_root +void bli_l3_sup_thrinfo_update ( - rntm_t* rntm, - thrinfo_t* thread + const rntm_t* rntm, + thrinfo_t** root ) { - // Query the current root for the total number of threads to use. - const dim_t n_threads = bli_thread_num_threads( thread ); - - // Query the current root for the (global) comm id. - const dim_t gl_comm_id = bli_thread_ocomm_id( thread ); - - // Query the rntm_t for the updated number of ways of parallelism. 
- const dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); - - // Recompute the work id for this thrinfo_t node using the updated - // number of ways of parallelism. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - - // Save the updated ways of parallelism and work id to the thrinfo_t node. - bli_thrinfo_set_n_way( xx_way, thread ); - bli_thrinfo_set_work_id( work_id, thread ); + thrcomm_t* gl_comm = bli_thrinfo_comm( *root ); + dim_t tid = bli_thrinfo_thread_id( *root ); + pool_t* pool = bli_thrinfo_sba_pool( *root ); + dim_t nt = bli_thrinfo_num_threads( *root ); + + // Return early in single-threaded execution + // since the thread control tree may not have been + // allocated normally + if ( nt == 1 ) return; + + bli_thrinfo_free( *root ); + *root = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm ); } // ----------------------------------------------------------------------------- @@ -178,7 +174,7 @@ void bli_l3_thrinfo_print_gemm_paths { // In order to query the number of threads, we query the only thread we // know exists: thread 0. 
- dim_t n_threads = bli_thread_num_threads( threads[0] ); + dim_t n_threads = bli_thrinfo_num_threads( threads[0] ); // For the purposes of printing the "header" information that is common // to the various instances of a thrinfo_t (ie: across all threads), we @@ -211,44 +207,44 @@ void bli_l3_thrinfo_print_gemm_paths if ( !jc_info ) goto print_header; - jc_way = bli_thread_n_way( jc_info ); - jc_nt = bli_thread_num_threads( jc_info ); + jc_way = bli_thrinfo_n_way( jc_info ); + jc_nt = bli_thrinfo_num_threads( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_header; - pc_way = bli_thread_n_way( pc_info ); - pc_nt = bli_thread_num_threads( pc_info ); + pc_way = bli_thrinfo_n_way( pc_info ); + pc_nt = bli_thrinfo_num_threads( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_header; - pb_way = bli_thread_n_way( pb_info ); - pb_nt = bli_thread_num_threads( pb_info ); + pb_way = bli_thrinfo_n_way( pb_info ); + pb_nt = bli_thrinfo_num_threads( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_header; - ic_way = bli_thread_n_way( ic_info ); - ic_nt = bli_thread_num_threads( ic_info ); + ic_way = bli_thrinfo_n_way( ic_info ); + ic_nt = bli_thrinfo_num_threads( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); if ( !pa_info ) goto print_header; - pa_way = bli_thread_n_way( pa_info ); - pa_nt = bli_thread_num_threads( pa_info ); + pa_way = bli_thrinfo_n_way( pa_info ); + pa_nt = bli_thrinfo_num_threads( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_header; - jr_way = bli_thread_n_way( jr_info ); - jr_nt = bli_thread_num_threads( jr_info ); + jr_way = bli_thrinfo_n_way( jr_info ); + jr_nt = bli_thrinfo_num_threads( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_header; - ir_way = bli_thread_n_way( ir_info ); - ir_nt = bli_thread_num_threads( ir_info ); + ir_way = bli_thrinfo_n_way( ir_info ); + ir_nt = 
bli_thrinfo_num_threads( ir_info ); print_header: @@ -262,7 +258,7 @@ void bli_l3_thrinfo_print_gemm_paths ( unsigned long )jr_nt, ( unsigned long )ir_nt ); printf( "xx_way: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", - ( unsigned long )jc_way, + ( unsigned long )jc_way, ( unsigned long )pc_way, ( unsigned long )pb_way, ( unsigned long )ic_way, @@ -283,44 +279,44 @@ void bli_l3_thrinfo_print_gemm_paths if ( !jc_info ) goto print_thrinfo; - jc_comm_id = bli_thread_ocomm_id( jc_info ); - jc_work_id = bli_thread_work_id( jc_info ); + jc_comm_id = bli_thrinfo_thread_id( jc_info ); + jc_work_id = bli_thrinfo_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; - pc_comm_id = bli_thread_ocomm_id( pc_info ); - pc_work_id = bli_thread_work_id( pc_info ); + pc_comm_id = bli_thrinfo_thread_id( pc_info ); + pc_work_id = bli_thrinfo_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; - pb_comm_id = bli_thread_ocomm_id( pb_info ); - pb_work_id = bli_thread_work_id( pb_info ); + pb_comm_id = bli_thrinfo_thread_id( pb_info ); + pb_work_id = bli_thrinfo_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; - ic_comm_id = bli_thread_ocomm_id( ic_info ); - ic_work_id = bli_thread_work_id( ic_info ); + ic_comm_id = bli_thrinfo_thread_id( ic_info ); + ic_work_id = bli_thrinfo_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); if ( !pa_info ) goto print_thrinfo; - pa_comm_id = bli_thread_ocomm_id( pa_info ); - pa_work_id = bli_thread_work_id( pa_info ); + pa_comm_id = bli_thrinfo_thread_id( pa_info ); + pa_work_id = bli_thrinfo_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; - jr_comm_id = bli_thread_ocomm_id( jr_info ); - jr_work_id = bli_thread_work_id( jr_info ); + jr_comm_id = bli_thrinfo_thread_id( jr_info ); + jr_work_id = bli_thrinfo_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( 
jr_info ); if ( !ir_info ) goto print_thrinfo; - ir_comm_id = bli_thread_ocomm_id( ir_info ); - ir_work_id = bli_thread_work_id( ir_info ); + ir_comm_id = bli_thrinfo_thread_id( ir_info ); + ir_work_id = bli_thrinfo_work_id( ir_info ); print_thrinfo: @@ -356,7 +352,7 @@ void bli_l3_thrinfo_print_trsm_paths { // In order to query the number of threads, we query the only thread we // know exists: thread 0. - dim_t n_threads = bli_thread_num_threads( threads[0] ); + dim_t n_threads = bli_thrinfo_num_threads( threads[0] ); // For the purposes of printing the "header" information that is common // to the various instances of a thrinfo_t (ie: across all threads), we @@ -391,26 +387,26 @@ void bli_l3_thrinfo_print_trsm_paths if ( !jc_info ) goto print_header; - jc_way = bli_thread_n_way( jc_info ); - jc_nt = bli_thread_num_threads( jc_info ); + jc_way = bli_thrinfo_n_way( jc_info ); + jc_nt = bli_thrinfo_num_threads( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_header; - pc_way = bli_thread_n_way( pc_info ); - pc_nt = bli_thread_num_threads( pc_info ); + pc_way = bli_thrinfo_n_way( pc_info ); + pc_nt = bli_thrinfo_num_threads( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_header; - pb_way = bli_thread_n_way( pb_info ); - pb_nt = bli_thread_num_threads( pb_info ); + pb_way = bli_thrinfo_n_way( pb_info ); + pb_nt = bli_thrinfo_num_threads( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_header; - ic_way = bli_thread_n_way( ic_info ); - ic_nt = bli_thread_num_threads( ic_info ); + ic_way = bli_thrinfo_n_way( ic_info ); + ic_nt = bli_thrinfo_num_threads( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); @@ -418,39 +414,39 @@ void bli_l3_thrinfo_print_trsm_paths if ( !pa_info0 ) goto check_header_node; - pa_way0 = bli_thread_n_way( pa_info0 ); - pa_nt0 = bli_thread_num_threads( pa_info0 ); + pa_way0 = bli_thrinfo_n_way( 
pa_info0 ); + pa_nt0 = bli_thrinfo_num_threads( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) goto check_header_node; - jr_way0 = bli_thread_n_way( jr_info0 ); - jr_nt0 = bli_thread_num_threads( jr_info0 ); + jr_way0 = bli_thrinfo_n_way( jr_info0 ); + jr_nt0 = bli_thrinfo_num_threads( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) goto check_header_node; - ir_way0 = bli_thread_n_way( ir_info0 ); - ir_nt0 = bli_thread_num_threads( ir_info0 ); + ir_way0 = bli_thrinfo_n_way( ir_info0 ); + ir_nt0 = bli_thrinfo_num_threads( ir_info0 ); check_header_node: if ( !pa_info ) goto print_header; - pa_way = bli_thread_n_way( pa_info ); - pa_nt = bli_thread_num_threads( pa_info ); + pa_way = bli_thrinfo_n_way( pa_info ); + pa_nt = bli_thrinfo_num_threads( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_header; - jr_way = bli_thread_n_way( jr_info ); - jr_nt = bli_thread_num_threads( jr_info ); + jr_way = bli_thrinfo_n_way( jr_info ); + jr_nt = bli_thrinfo_num_threads( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_header; - ir_way = bli_thread_n_way( ir_info ); - ir_nt = bli_thread_num_threads( ir_info ); + ir_way = bli_thrinfo_n_way( ir_info ); + ir_nt = bli_thrinfo_num_threads( ir_info ); print_header: @@ -493,26 +489,26 @@ void bli_l3_thrinfo_print_trsm_paths if ( !jc_info ) goto print_thrinfo; - jc_comm_id = bli_thread_ocomm_id( jc_info ); - jc_work_id = bli_thread_work_id( jc_info ); + jc_comm_id = bli_thrinfo_thread_id( jc_info ); + jc_work_id = bli_thrinfo_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; - pc_comm_id = bli_thread_ocomm_id( pc_info ); - pc_work_id = bli_thread_work_id( pc_info ); + pc_comm_id = bli_thrinfo_thread_id( pc_info ); + pc_work_id = bli_thrinfo_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; - pb_comm_id = bli_thread_ocomm_id( 
pb_info ); - pb_work_id = bli_thread_work_id( pb_info ); + pb_comm_id = bli_thrinfo_thread_id( pb_info ); + pb_work_id = bli_thrinfo_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; - ic_comm_id = bli_thread_ocomm_id( ic_info ); - ic_work_id = bli_thread_work_id( ic_info ); + ic_comm_id = bli_thrinfo_thread_id( ic_info ); + ic_work_id = bli_thrinfo_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); @@ -520,39 +516,39 @@ void bli_l3_thrinfo_print_trsm_paths if ( !pa_info0 ) goto check_thrinfo_node; - pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); - pa_work_id0 = bli_thread_work_id( pa_info0 ); + pa_comm_id0 = bli_thrinfo_thread_id( pa_info0 ); + pa_work_id0 = bli_thrinfo_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) goto check_thrinfo_node; - jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); - jr_work_id0 = bli_thread_work_id( jr_info0 ); + jr_comm_id0 = bli_thrinfo_thread_id( jr_info0 ); + jr_work_id0 = bli_thrinfo_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) goto check_thrinfo_node; - ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); - ir_work_id0 = bli_thread_work_id( ir_info0 ); + ir_comm_id0 = bli_thrinfo_thread_id( ir_info0 ); + ir_work_id0 = bli_thrinfo_work_id( ir_info0 ); check_thrinfo_node: if ( !pa_info ) goto print_thrinfo; - pa_comm_id = bli_thread_ocomm_id( pa_info ); - pa_work_id = bli_thread_work_id( pa_info ); + pa_comm_id = bli_thrinfo_thread_id( pa_info ); + pa_work_id = bli_thrinfo_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; - jr_comm_id = bli_thread_ocomm_id( jr_info ); - jr_work_id = bli_thread_work_id( jr_info ); + jr_comm_id = bli_thrinfo_thread_id( jr_info ); + jr_work_id = bli_thrinfo_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_thrinfo; - ir_comm_id = 
bli_thread_ocomm_id( ir_info ); - ir_work_id = bli_thread_work_id( ir_info ); + ir_comm_id = bli_thrinfo_thread_id( ir_info ); + ir_work_id = bli_thrinfo_work_id( ir_info ); print_thrinfo: #else @@ -584,8 +580,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jc_comm_id = bli_thread_ocomm_id( jc_info ); - jc_work_id = bli_thread_work_id( jc_info ); + jc_comm_id = bli_thrinfo_thread_id( jc_info ); + jc_work_id = bli_thrinfo_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) @@ -595,8 +591,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pc_comm_id = bli_thread_ocomm_id( pc_info ); - pc_work_id = bli_thread_work_id( pc_info ); + pc_comm_id = bli_thrinfo_thread_id( pc_info ); + pc_work_id = bli_thrinfo_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) @@ -606,8 +602,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pb_comm_id = bli_thread_ocomm_id( pb_info ); - pb_work_id = bli_thread_work_id( pb_info ); + pb_comm_id = bli_thrinfo_thread_id( pb_info ); + pb_work_id = bli_thrinfo_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) @@ -617,8 +613,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ic_comm_id = bli_thread_ocomm_id( ic_info ); - ic_work_id = bli_thread_work_id( ic_info ); + ic_comm_id = bli_thrinfo_thread_id( ic_info ); + ic_work_id = bli_thrinfo_work_id( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); @@ -630,8 +626,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); - pa_work_id0 = bli_thread_work_id( pa_info0 ); + pa_comm_id0 = bli_thrinfo_thread_id( pa_info0 ); + pa_work_id0 = bli_thrinfo_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) @@ -641,8 +637,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); - jr_work_id0 = bli_thread_work_id( jr_info0 ); + jr_comm_id0 = 
bli_thrinfo_thread_id( jr_info0 ); + jr_work_id0 = bli_thrinfo_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) @@ -652,8 +648,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); - ir_work_id0 = bli_thread_work_id( ir_info0 ); + ir_comm_id0 = bli_thrinfo_thread_id( ir_info0 ); + ir_work_id0 = bli_thrinfo_work_id( ir_info0 ); } } } @@ -666,8 +662,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pa_comm_id = bli_thread_ocomm_id( pa_info ); - pa_work_id = bli_thread_work_id( pa_info ); + pa_comm_id = bli_thrinfo_thread_id( pa_info ); + pa_work_id = bli_thrinfo_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) @@ -677,8 +673,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jr_comm_id = bli_thread_ocomm_id( jr_info ); - jr_work_id = bli_thread_work_id( jr_info ); + jr_comm_id = bli_thrinfo_thread_id( jr_info ); + jr_work_id = bli_thrinfo_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) @@ -688,8 +684,8 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ir_comm_id = bli_thread_ocomm_id( ir_info ); - ir_work_id = bli_thread_work_id( ir_info ); + ir_comm_id = bli_thrinfo_thread_id( ir_info ); + ir_work_id = bli_thrinfo_work_id( ir_info ); } } } @@ -724,15 +720,14 @@ void bli_l3_thrinfo_print_trsm_paths void bli_l3_thrinfo_free_paths ( - rntm_t* rntm, thrinfo_t** threads ) { - dim_t n_threads = bli_thread_num_threads( threads[0] ); + dim_t n_threads = bli_thrinfo_num_threads( threads[0] ); dim_t i; for ( i = 0; i < n_threads; ++i ) - bli_l3_thrinfo_free( rntm, threads[i] ); + bli_thrinfo_free( threads[i] ); bli_free_intl( threads ); } diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 37a3909fd..b1290df50 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -68,60 +68,36 @@ \ ( index % thread->n_way == thread->work_id % thread->n_way ) -// -// thrinfo_t APIs specific to level-3 operations. 
-// - -void bli_l3_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ); - -void bli_l3_thrinfo_init_single - ( - thrinfo_t* thread - ); - -void bli_l3_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ); +// ----------------------------------------------------------------------------- -void bli_l3_sup_thrinfo_free +BLIS_EXPORT_BLIS thrinfo_t* bli_l3_thrinfo_create ( - rntm_t* rntm, - thrinfo_t* thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm, + const cntl_t* cntl ); -// ----------------------------------------------------------------------------- - -void bli_l3_thrinfo_create_root +void bli_l3_thrinfo_grow ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t** thread + thrinfo_t* thread_par, + const rntm_t* rntm, + const cntl_t* cntl ); -void bli_l3_sup_thrinfo_create_root +thrinfo_t* bli_l3_sup_thrinfo_create ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - thrinfo_t** thread + dim_t id, + thrcomm_t* gl_comm, + pool_t* pool, + const rntm_t* rntm ); -void bli_l3_sup_thrinfo_update_root +void bli_l3_sup_thrinfo_update ( - rntm_t* rntm, - thrinfo_t* thread + const rntm_t* rntm, + thrinfo_t** root ); void bli_l3_thrinfo_print_gemm_paths @@ -138,7 +114,6 @@ void bli_l3_thrinfo_print_trsm_paths void bli_l3_thrinfo_free_paths ( - rntm_t* rntm, thrinfo_t** threads ); diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 485779a90..f841e5eb2 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -37,13 +37,12 @@ void bli_gemm_blk_var1 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { obj_t ap, cp; @@ -58,6 +57,7 @@ void bli_gemm_blk_var1 // Determine the 
current thread's subpartition range. dim_t my_start, my_end; + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); bli_thread_range_mdim ( direct, thread, &ap, b, &cp, cntl, cntx, @@ -88,9 +88,8 @@ void bli_gemm_blk_var1 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) + thread ); } } diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 254a31064..ceadce7d7 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -37,13 +37,12 @@ void bli_gemm_blk_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { obj_t bp, cp; @@ -58,6 +57,7 @@ void bli_gemm_blk_var2 // Determine the current thread's subpartition range. dim_t my_start, my_end; + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); bli_thread_range_ndim ( direct, thread, a, &bp, &cp, cntl, cntx, @@ -88,9 +88,8 @@ void bli_gemm_blk_var2 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) + thread ); } } diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index cb20b7f36..d683cfc88 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -36,13 +36,12 @@ void bli_gemm_blk_var3 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { obj_t ap, bp, cs; @@ -50,6 +49,8 @@ void bli_gemm_blk_var3 bli_obj_alias_to( b, &bp ); bli_obj_alias_to( c, &cs ); + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + // Determine the direction in which to partition (forwards or backwards). 
dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl ); @@ -83,13 +84,10 @@ void bli_gemm_blk_var3 &BLIS_ONE, &cs, cntx, - rntm, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) + thread ); - bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) ); - // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it // only for the first iteration (and then BLIS_ONE for all others). diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 052c812a3..bd8d97d13 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -37,21 +37,21 @@ cntl_t* bli_gemm_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ) { - return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b, ker ); + return bli_gemmbp_cntl_create( pool, family, schema_a, schema_b, ker ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, @@ -73,18 +73,18 @@ cntl_t* bli_gemmbp_cntl_create // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( - rntm, // the thread's runtime structure - family, // the operation family - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. + pool, // the thread's sba pool + family, // the operation family + BLIS_MR, + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. 
); cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node ( - rntm, // the thread's runtime structure + pool, // the thread's sba pool family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + BLIS_NR, macro_kernel_fp, gemm_cntl_bu_ke ); @@ -92,14 +92,14 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( - rntm, - bli_l3_packa, // pack the left-hand operand + pool, + bli_l3_packa, // pack the left-hand operand BLIS_MR, BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_a, // normally BLIS_PACKED_ROW_PANELS + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); @@ -107,7 +107,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the m dimension by MC. cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node ( - rntm, + pool, family, BLIS_MC, bli_gemm_blk_var1, @@ -117,14 +117,14 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for packing matrix B. cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( - rntm, - bli_l3_packb, // pack the right-hand operand + pool, + bli_l3_packb, // pack the right-hand operand BLIS_NR, BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_b, // normally BLIS_PACKED_COL_PANELS + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, gemm_cntl_op_bp ); @@ -132,7 +132,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the k dimension by KC. 
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( - rntm, + pool, family, BLIS_KC, bli_gemm_blk_var3, @@ -142,7 +142,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the n dimension by NC. cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( - rntm, + pool, family, BLIS_NC, bli_gemm_blk_var2, @@ -154,124 +154,26 @@ cntl_t* bli_gemmbp_cntl_create // ----------------------------------------------------------------------------- -// This control tree creation function is disabled because it is no longer used. -// (It was originally created in the run up to publishing the 1m journal article, -// but was disabled to reduce complexity.) -#if 0 -cntl_t* bli_gemmpb_cntl_create - ( - opid_t family - ) -{ - void_fp macro_kernel_p = bli_gemm_ker_var1; - - // Change the macro-kernel if the operation family is gemmt or trmm. - //if ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2; - //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; - - // Create two nodes for the macro-kernel. - cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node - ( - family, // the operation family - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. - ); - - cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node - ( - family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, - gemm_cntl_ub_ke - ); - - // Create a node for packing matrix A (which is really the right-hand - // operand "B"). - cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node - ( - bli_gemm_packb, // pack the right-hand operand - bli_packm_blk_var1, - BLIS_MR, - BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - gemm_cntl_pb_ub - ); - - // Create a node for partitioning the n dimension by MC. 
- cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node - ( - family, - BLIS_MC, - bli_gemm_blk_var2, - gemm_cntl_packb - ); - - // Create a node for packing matrix B (which is really the left-hand - // operand "A"). - cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node - ( - bli_gemm_packa, // pack the left-hand operand - bli_packm_blk_var1, - BLIS_NR, - BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - gemm_cntl_op_pb - ); - - // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node - ( - family, - BLIS_KC, - bli_gemm_blk_var3, - gemm_cntl_packa - ); - - // Create a node for partitioning the m dimension by NC. - cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node - ( - family, - BLIS_NC, - bli_gemm_blk_var1, - gemm_cntl_mm_op - ); - - return gemm_cntl_vl_mm; -} -#endif - -// ----------------------------------------------------------------------------- - void bli_gemm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ) { - bli_cntl_free( rntm, cntl, thread ); + bli_cntl_free( pool, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ) { - return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 5fa213ac4..48e0652ca 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -35,7 +35,7 @@ cntl_t* bli_gemm_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, @@ -46,34 +46,26 @@ cntl_t* bli_gemm_cntl_create cntl_t* 
bli_gemmbp_cntl_create ( - rntm_t* rntm, + pool_t* pool, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); -#if 0 -cntl_t* bli_gemmpb_cntl_create - ( - opid_t family, - ); -#endif - // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 5f992bd67..fe0dc61a8 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -43,8 +43,7 @@ void bli_gemm_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -244,8 +243,7 @@ void bli_gemm_front betap, cp, cntx, - rntm, - cntl + rntm ); #ifdef BLIS_ENABLE_GEMM_MD diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 744f88d1b..9465c37d9 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -40,8 +40,7 @@ void bli_gemm_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); #ifdef BLIS_ENABLE_SMALL_MATRIX @@ -53,7 +52,7 @@ err_t bli_gemm_small const obj_t* beta, const obj_t* c, const cntx_t* cntx, - cntl_t* cntl + const cntl_t* cntl ); #endif diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 199e72cb6..51dceced2 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -81,9 +81,8 @@ void bli_gemm_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { num_t dt_exec = bli_obj_exec_dt( c ); @@ -254,13 +253,14 @@ void 
bli_gemm_ker_var2 // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - dim_t jr_nt = bli_thread_n_way( thread ); - dim_t jr_tid = bli_thread_work_id( thread ); - dim_t ir_nt = bli_thread_n_way( caucus ); - dim_t ir_tid = bli_thread_work_id( caucus ); + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + dim_t ir_nt = bli_thrinfo_n_way( caucus ); + dim_t ir_tid = bli_thrinfo_work_id( caucus ); dim_t jr_start, jr_end; dim_t ir_start, ir_end; diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index d3109e600..24f7ecfb9 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -53,12 +53,11 @@ typedef struct \ void PASTEMAC0(opname) \ ( \ - const obj_t* a, \ - const obj_t* b, \ - const obj_t* c, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/other/bli_gemm_ker_var1.c similarity index 100% rename from frame/3/gemm/bli_gemm_ker_var1.c rename to frame/3/gemm/other/bli_gemm_ker_var1.c diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c index c5cf935b8..8b26b2263 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2.c @@ -270,10 +270,10 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( 
caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + dim_t jr_num_threads = bli_thrinfo_n_way( thread ); \ + dim_t jr_thread_id = bli_thrinfo_work_id( thread ); \ + dim_t ir_num_threads = bli_thrinfo_n_way( caucus ); \ + dim_t ir_thread_id = bli_thrinfo_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c index 946e3048c..c374e178b 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2rr.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c @@ -276,10 +276,10 @@ void PASTEMAC(ch,varname) \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c index f5159bbb9..f61911c53 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2sl.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c @@ -276,10 +276,10 @@ void PASTEMAC(ch,varname) \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 49b32c976..d75738a94 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -43,8 +43,7 @@ void bli_gemmt_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -106,8 +105,7 @@ void bli_gemmt_front beta, &c_local, cntx, - rntm, - cntl + rntm ); } diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/gemmt/bli_gemmt_front.h index 0f2a9ada2..4a7cd7abe 100644 --- a/frame/3/gemmt/bli_gemmt_front.h +++ b/frame/3/gemmt/bli_gemmt_front.h @@ -41,6 +41,5 @@ void bli_gemmt_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index aed0359ec..4a3a48304 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -35,30 +35,46 @@ #include "blis.h" -#define FUNCPTR_T gemmt_fp +typedef void (*xpbys_mxn_l_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - 
thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); +static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); void bli_gemmt_l_ker_var2 ( @@ -66,30 +82,28 @@ void bli_gemmt_l_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffc = bli_obj_diag_offset( c ); + doff_t diagoffc = bli_obj_diag_offset( c ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); - const inc_t cs_a = bli_obj_col_stride( a ); const inc_t is_a = bli_obj_imag_stride( a ); const dim_t pd_a = bli_obj_panel_dim( a ); const inc_t ps_a = bli_obj_panel_stride( a ); const void* buf_b = bli_obj_buffer_at_off( b ); - const inc_t rs_b = bli_obj_row_stride( b ); const inc_t is_b = bli_obj_imag_stride( b ); const dim_t pd_b = bli_obj_panel_dim( b ); const inc_t ps_b = bli_obj_panel_stride( b ); @@ -109,97 +123,32 @@ void bli_gemmt_l_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* 
buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffc, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, is_a, - pd_a, ps_a, - ( void* )buf_b, rs_b, is_b, - pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt ]; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -212,296 +161,280 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region above where the diagonal of C intersects - the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. 
*/ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* Save the desired output datatype (indicating no typecasting). */ \ - /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. 
Any remainder from this integer division is discarded, which - is what we want. That is, we want the rectangular region to contain - as many columns of whole microtiles as possible without including any - microtiles that intersect the diagonal. The number of iterations in - the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Determine the thread range and increment for the 2nd and 1st loops for - the initial rectangular region of C (if it exists). - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd loop - and the default (slab or rr) partitioning in the 1st loop for the - remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. 
*/ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly below the diagonal, - we compute and store as we normally would. - And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - MR, \ - NR, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -} + */ + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of C is entirely above the diagonal, + // it is not stored. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; + + // If there is a zero region above where the diagonal of C intersects + // the left edge of the panel, adjust the pointer to C and A and treat + // this case as if the diagonal offset were zero. + if ( diagoffc < 0 ) + { + dim_t ip = -diagoffc / MR; + dim_t i = ip * MR; + m = m - i; + diagoffc = -diagoffc % MR; + c_cast = c_cast + (i )*rs_c*dt_size; + a_cast = a_cast + (ip )*ps_a*dt_size; + } + + // If there is a zero region to the right of where the diagonal + // of C intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffc + m < n ) + { + n = diagoffc + m; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // Save the desired output datatype (indicating no typecasting). 
+ //bli_auxinfo_set_dt_on_output( dt, &aux );*/ + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + dim_t ir_nt = bli_thrinfo_n_way( caucus ); + dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; -INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) + // Note that we partition the 2nd loop into two regions: the rectangular + // part of C, and the triangular portion. + dim_t n_iter_rct; + dim_t n_iter_tri; + + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) + { + // If the entire panel of C does not intersect the diagonal, there is + // no triangular region, and therefore we can skip the second set of + // loops. + n_iter_rct = n_iter; + n_iter_tri = 0; + } + else + { + // If the panel of C does intersect the diagonal, compute the number of + // iterations in the rectangular region by dividing NR into the diagonal + // offset. Any remainder from this integer division is discarded, which + // is what we want. That is, we want the rectangular region to contain + // as many columns of whole microtiles as possible without including any + // microtiles that intersect the diagonal. The number of iterations in + // the triangular (or trapezoidal) region is computed as the remaining + // number of iterations in the n dimension. + n_iter_rct = diagoffc / NR; + n_iter_tri = n_iter - n_iter_rct; + } + + // Determine the thread range and increment for the 2nd and 1st loops for + // the initial rectangular region of C (if it exists). 
+ // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // Interior loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // No need to compute the diagonal offset for the rectangular + // region. + //diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // If the diagonal intersects the current MR x NR submatrix, we + // compute it the temporary buffer and then add in the elements + // on or below the diagonal. + // Otherwise, if the submatrix is strictly below the diagonal, + // we compute and store as we normally would. + // And if we're strictly above the diagonal, we do nothing and + // continue. + { + // Invoke the gemm micro-kernel. 
+ gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + // If there is no triangular region, then we're done. + if ( n_iter_tri == 0 ) return; + + // Use round-robin assignment of micropanels to threads in the 2nd loop + // and the default (slab or rr) partitioning in the 1st loop for the + // remaining triangular region of C. + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + + // Advance the start and end iteration offsets for the triangular region + // by the number of iterations used for the rectangular region. + jr_start += n_iter_rct; + jr_end += n_iter_rct; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // Interior loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the diagonal offset for the submatrix at (i,j). + doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. 
+ bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // If the diagonal intersects the current MR x NR submatrix, we + // compute it the temporary buffer and then add in the elements + // on or below the diagonal. + // Otherwise, if the submatrix is strictly below the diagonal, + // we compute and store as we normally would. + // And if we're strictly above the diagonal, we do nothing and + // continue. + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Scale C and add the result to only the stored part. + xpbys_mxn_l_ukr( diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c ); + } + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } +} diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 87d77ee55..5b4e1ccd9 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -35,30 +35,46 @@ #include "blis.h" -#define FUNCPTR_T gemmt_fp +typedef void (*xpbys_mxn_u_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T 
GENARRAY(ftypes,gemmt_u_ker_var2); +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn); +static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn); void bli_gemmt_u_ker_var2 ( @@ -66,30 +82,28 @@ void bli_gemmt_u_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffc = bli_obj_diag_offset( c ); + doff_t diagoffc = bli_obj_diag_offset( c ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); - const inc_t cs_a = bli_obj_col_stride( a ); const inc_t is_a = bli_obj_imag_stride( a ); const dim_t pd_a = bli_obj_panel_dim( a ); const inc_t ps_a = bli_obj_panel_stride( a ); const void* buf_b = bli_obj_buffer_at_off( b ); - const inc_t rs_b = bli_obj_row_stride( b ); const inc_t is_b = bli_obj_imag_stride( b ); const dim_t pd_b = bli_obj_panel_dim( b ); const inc_t ps_b = bli_obj_panel_stride( b ); @@ -109,97 +123,32 @@ void bli_gemmt_u_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c 
); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffc, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, is_a, - pd_a, ps_a, - ( void* )buf_b, rs_b, is_b, - pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt ]; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -212,299 +161,283 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. 
*/ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ - /* If there is a zero region to the left of where the diagonal of C - intersects the top edge of the panel, adjust the pointer to C and B - and treat this case as if the diagonal offset were zero. - NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ - /* If there is a zero region below where the diagonal of C intersects - the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* Save the desired output datatype (indicating no typecasting). */ \ - /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. 
Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ - /* If the entire panel of C does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of C does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in C. A non-zero remainder means we need to - add one additional iteration. That is, we want the triangular region - to contain as few columns of whole microtiles as possible while still - including all microtiles that intersect the diagonal. The number of - iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd loop - and the default (slab or rr) partitioning in the 1st loop for the - initial triangular region of C (if it exists). 
*/ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - MR, \ - NR, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Determine the thread range and increment for the 2nd loop of the - remaining rectangular region of C (and also use default partitioning - for the 1st loop). - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* If the diagonal intersects the current MR x NR submatrix, we - compute it the temporary buffer and then add in the elements - on or below the diagonal. - Otherwise, if the submatrix is strictly above the diagonal, - we compute and store as we normally would. - And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -} + */ + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of C is entirely below the diagonal, + // it is not stored. So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; + + // If there is a zero region to the left of where the diagonal of C + // intersects the top edge of the panel, adjust the pointer to C and B + // and treat this case as if the diagonal offset were zero. 
+ // NOTE: It's possible that after this pruning the diagonal offset + // is still positive (though it is guaranteed to be less than NR). + if ( diagoffc > 0 ) + { + dim_t jp = diagoffc / NR; + dim_t j = jp * NR; + n = n - j; + diagoffc = diagoffc % NR; + c_cast = c_cast + (j )*cs_c*dt_size; + b_cast = b_cast + (jp )*ps_b*dt_size; + } + + // If there is a zero region below where the diagonal of C intersects + // the right edge of the panel, shrink it to prevent "no-op" iterations + // from executing. + if ( -diagoffc + n < m ) + { + m = -diagoffc + n; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // Save the desired output datatype (indicating no typecasting). + //bli_auxinfo_set_dt_on_output( dt, &aux ); + + // The 'thread_par' argument is the parent thrinfo_t node; its sub-node + // ('thread') serves the 2nd (jr) loop around the microkernel, and that + // node's sub-node ('caucus') serves the 1st (ir) loop. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop.
+ dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + dim_t ir_nt = bli_thrinfo_n_way( caucus ); + dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; -INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) + // Note that we partition the 2nd loop into two regions: the triangular + // part of C, and the rectangular portion. + dim_t n_iter_tri; + dim_t n_iter_rct; + + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) + { + // If the entire panel of C does not intersect the diagonal, there is + // no triangular region, and therefore we can skip the first set of + // loops. + n_iter_tri = 0; + n_iter_rct = n_iter; + } + else + { + // If the panel of C does intersect the diagonal, compute the number of + // iterations in the triangular (or trapezoidal) region by dividing NR + // into the number of rows in C. A non-zero remainder means we need to + // add one additional iteration. That is, we want the triangular region + // to contain as few columns of whole microtiles as possible while still + // including all microtiles that intersect the diagonal. The number of + // iterations in the rectangular region is computed as the remaining + // number of iterations in the n dimension. + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); + n_iter_rct = n_iter - n_iter_tri; + } + + // Use round-robin assignment of micropanels to threads in the 2nd loop + // and the default (slab or rr) partitioning in the 1st loop for the + // initial triangular region of C (if it exists). + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). 
+ for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // Interior loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the diagonal offset for the submatrix at (i,j). + doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // If the diagonal intersects the current MR x NR submatrix, we + // compute it in the temporary buffer and then add in the elements + // on or above the diagonal. + // Otherwise, if the submatrix is strictly above the diagonal, + // we compute and store as we normally would. + // And if we're strictly below the diagonal, we do nothing and + // continue. + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Scale C and add the result to only the stored part.
+ xpbys_mxn_u_ukr( diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c ); + } + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + // If there is no rectangular region, then we're done. + if ( n_iter_rct == 0 ) return; + + // Determine the thread range and increment for the 2nd loop of the + // remaining rectangular region of C (and also use default partitioning + // for the 1st loop). + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + + // Advance the start and end iteration offsets for the rectangular region + // by the number of iterations used for the triangular region. + jr_start += n_iter_tri; + jr_end += n_iter_tri; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // Interior loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // No need to compute the diagonal offset for the rectangular + // region. + //diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B.
+ const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // If the diagonal intersects the current MR x NR submatrix, we + // compute it the temporary buffer and then add in the elements + // on or below the diagonal. + // Otherwise, if the submatrix is strictly above the diagonal, + // we compute and store as we normally would. + // And if we're strictly below the diagonal, we do nothing and + // continue. + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } +} diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h index 98d8f5563..eb6e16018 100644 --- a/frame/3/gemmt/bli_gemmt_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* ah, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -81,7 +80,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 76fe106b0..207e1c938 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -46,20 +46,18 @@ void bli_gemmt_x_ker_var2 const obj_t* ah, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - dim_t uplo; - l3_var_oft f; + dim_t uplo; // Set a bool based on the uplo field of C's 
root object. if ( bli_obj_root_is_lower( c ) ) uplo = 0; else uplo = 1; // Index into the variant array to extract the correct function pointer. - f = vars[uplo]; + l3_var_oft f = vars[uplo]; // Call the macrokernel. f @@ -68,7 +66,6 @@ void bli_gemmt_x_ker_var2 ah, c, cntx, - rntm, cntl, thread ); diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c index ece351ef7..64df59e88 100644 --- a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c @@ -284,10 +284,10 @@ void PASTEMAC(ch,varname) \ c1 = c_cast; \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + dim_t jr_num_threads = bli_thrinfo_n_way( thread ); \ + dim_t jr_thread_id = bli_thrinfo_work_id( thread ); \ + dim_t ir_num_threads = bli_thrinfo_n_way( caucus ); \ + dim_t ir_thread_id = bli_thrinfo_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c index f00e769b5..d5acec3b8 100644 --- a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c @@ -284,10 +284,10 @@ void PASTEMAC(ch,varname) \ c1 = c_cast; \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ - dim_t jr_num_threads = bli_thread_n_way( thread ); \ - dim_t jr_thread_id = bli_thread_work_id( thread ); \ - dim_t ir_num_threads = bli_thread_n_way( caucus ); \ - dim_t ir_thread_id = bli_thread_work_id( caucus ); \ + dim_t jr_num_threads = bli_thrinfo_n_way( thread ); \ + dim_t jr_thread_id = bli_thrinfo_work_id( thread ); \ + dim_t ir_num_threads = bli_thrinfo_n_way( caucus ); \ + dim_t ir_thread_id = bli_thrinfo_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index c39703503..a281ddade 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -43,8 +43,7 @@ void bli_hemm_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -161,8 +160,7 @@ void bli_hemm_front beta, &c_local, cntx, - rntm, - cntl + rntm ); } diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 63eb91cd3..2ccd8e0c8 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -41,6 +41,5 @@ void bli_hemm_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index c9aada989..1ee5e0a7f 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -43,8 +43,7 @@ void bli_symm_front const obj_t* beta, const obj_t* c, const 
cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -160,8 +159,7 @@ void bli_symm_front beta, &c_local, cntx, - rntm, - cntl + rntm ); } diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 417cb9acb..585ec1025 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -41,6 +41,5 @@ void bli_symm_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index edd4ce1ef..d351e78e1 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -42,8 +42,7 @@ void bli_trmm_front const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -179,8 +178,7 @@ void bli_trmm_front &BLIS_ZERO, &c_local, cntx, - rntm, - cntl + rntm ); } diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index cfefdd39b..f13d4c34b 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -39,6 +39,5 @@ void bli_trmm_front const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index f5476b2ca..3bc4e3c6b 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); - - void bli_trmm_ll_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const 
cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffa = bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_ll_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + // Query the context for the micro-kernel address and cast it to its + // function pointer type. 
+ gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1011; \ - dim_t off_a1011; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -200,227 +111,201 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current block of A is entirely above the diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ -\ - /* Compute k_full. For all trmm, k_full is simply k. This is - needed because some parameter combinations of trmm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = k; \ -\ - /* If there is a zero region above where the diagonal of A intersects the - left edge of the block, adjust the pointer to C and treat this case as - if the diagonal offset were zero. 
This skips over the region that was - not packed. (Note we assume the diagonal offset is a multiple of MR; - this assumption will hold as long as the cache blocksizes are each a - multiple of MR and NR.) */ \ - if ( diagoffa < 0 ) \ - { \ - i = -diagoffa; \ - m = m - i; \ - diagoffa = 0; \ - c_cast = c_cast + (i )*rs_c; \ - } \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ -\ - dim_t jr_start, jr_end; \ - /*dim_t ir_start, ir_end;*/ \ - dim_t jr_inc; \ -\ - /* Determine the thread range and increment for the 2nd loop. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. 
\ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* If the current panel of A intersects the diagonal, scale C - by beta. If it is strictly below the diagonal, scale by one. - This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict b1_i; \ - ctype* restrict a2; \ -\ - /* Determine the offset to and length of the panel that was - packed so we can index into the corresponding location in - b1. */ \ - off_a1011 = 0; \ - k_a1011 = bli_min( diagoffa_i + MR, k ); \ -\ - /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1011 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - b1_i = b1 + off_a1011 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ - { \ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += rstep_a; \ - } \ -\ - c11 += rstep_c; \ - } \ - } \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. 
+ if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current block of A is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; + + // If there is a zero region above where the diagonal of A intersects the + // left edge of the block, adjust the pointer to C and treat this case as + // if the diagonal offset were zero. This skips over the region that was + // not packed. (Note we assume the diagonal offset is a multiple of MR; + // this assumption will hold as long as the cache blocksizes are each a + // multiple of MR and NR.) + if ( diagoffa < 0 ) + { + m += diagoffa; + c_cast -= diagoffa * rs_c * dt_size; + diagoffa = 0; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + //thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. 
+ thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + //dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); + //dim_t ir_tid = bli_thrinfo_work_id( ir_thread ); + + dim_t jr_start, jr_end; + //dim_t ir_start, ir_end; + dim_t jr_inc; + + // Determine the thread range and increment for the 2nd loop. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + //bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1; + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // If the current panel of A intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Determine the offset to and length of the panel that was + // packed so we can index into the corresponding location in + // b1. + dim_t off_a1011 = 0; + dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. 
+ inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + // NOTE: ir loop parallelism disabled for now. + //if ( bli_trmm_my_iter( i, ir_thread ) ) { + + const char* b1_i = b1 + off_a1011 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_a1011, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + //} + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) + { + // NOTE: ir loop parallelism disabled for now. + //if ( bli_trmm_my_iter( i, ir_thread ) ) { + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. 
+ gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + //} + + a1 += rstep_a; + } + + c11 += rstep_c; + } + } } -INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 ) +//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index df5b2dac5..265e21a66 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); - - void bli_trmm_lu_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffa = bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_lu_ker_var2 
const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1112; \ - dim_t off_a1112; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -200,235 +111,208 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current block of A is entirely below the diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ -\ - /* Compute k_full. For all trmm, k_full is simply k. This is - needed because some parameter combinations of trmm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. 
*/ \ - k_full = k; \ -\ - /* If there is a zero region to the left of where the diagonal of A - intersects the top edge of the block, adjust the pointer to B and - treat this case as if the diagonal offset were zero. Note that we - don't need to adjust the pointer to A since packm would have simply - skipped over the region that was not stored. */ \ - if ( diagoffa > 0 ) \ - { \ - i = diagoffa; \ - k = k - i; \ - diagoffa = 0; \ - b_cast = b_cast + i * PACKNR; \ - } \ -\ - /* If there is a zero region below where the diagonal of A intersects the - right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffa + k < m ) \ - { \ - m = -diagoffa + k; \ - } \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ -\ - dim_t jr_start, jr_end; \ - /*dim_t ir_start, ir_end;*/ \ - dim_t jr_inc; \ -\ - /* Determine the thread range and increment for the 2nd loop. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* If the current panel of A intersects the diagonal, scale C - by beta. If it is strictly above the diagonal, scale by one. - This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict b1_i; \ - ctype* restrict a2; \ -\ - /* Determine the offset to and length of the panel that was - packed so we can index into the corresponding location in - b1. 
*/ \ - off_a1112 = diagoffa_i; \ - k_a1112 = k - off_a1112; \ -\ - /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1112 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - b1_i = b1 + off_a1112 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a1112, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ - { \ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += rstep_a; \ - } \ -\ - c11 += rstep_c; \ - } \ - } \ -\ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current block of A is entirely below the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; + + // If there is a zero region to the left of where the diagonal of A + // intersects the top edge of the block, adjust the pointer to B and + // treat this case as if the diagonal offset were zero. Note that we + // don't need to adjust the pointer to A since packm would have simply + // skipped over the region that was not stored. + if ( diagoffa > 0 ) + { + k -= diagoffa; + b_cast += diagoffa * PACKNR * dt_size; + diagoffa = 0; + } + + // If there is a zero region below where the diagonal of A intersects the + // right side of the block, shrink it to prevent "no-op" iterations from + // executing. + if ( -diagoffa + k < m ) + { + m = -diagoffa + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. 
+ inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + //thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + //dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); + //dim_t ir_tid = bli_thrinfo_work_id( ir_thread ); + + dim_t jr_start, jr_end; + //dim_t ir_start, ir_end; + dim_t jr_inc; + + // Determine the thread range and increment for the 2nd loop. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + //bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1; + + // Loop over the m dimension (MR rows at a time). 
+ for ( dim_t i = 0; i < m_iter; ++i ) + { + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // If the current panel of A intersects the diagonal, scale C + // by beta. If it is strictly above the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Determine the offset to and length of the panel that was + // packed so we can index into the corresponding location in + // b1. + dim_t off_a1112 = diagoffa_i; + dim_t k_a1112 = k - off_a1112; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1112 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + // NOTE: ir loop parallelism disabled for now. + //if ( bli_trmm_my_iter( i, ir_thread ) ) { + + const char* b1_i = b1 + off_a1112 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_a1112, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + //} + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) + { + // NOTE: ir loop parallelism disabled for now. + //if ( bli_trmm_my_iter( i, ir_thread ) ) { + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + //} + + a1 += rstep_a; + } + + c11 += rstep_c; + } + } } -INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 ) +//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 89f86aa3a..785f2cf5f 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); - - void bli_trmm_rl_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = 
bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_rl_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b1121; \ - dim_t off_b1121; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -200,292 +111,261 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of B is entirely above the diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ -\ - /* Compute k_full. For all trmm, k_full is simply k. 
This is - needed because some parameter combinations of trmm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of A (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = k; \ -\ - /* If there is a zero region above where the diagonal of B intersects - the left edge of the panel, adjust the pointer to A and treat this - case as if the diagonal offset were zero. Note that we don't need to - adjust the pointer to B since packm would have simply skipped over - the region that was not stored. */ \ - if ( diagoffb < 0 ) \ - { \ - j = -diagoffb; \ - k = k - j; \ - diagoffb = 0; \ - a_cast = a_cast + j * PACKMR; \ - } \ -\ - /* If there is a zero region to the right of where the diagonal - of B intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffb + k < n ) \ - { \ - n = diagoffb + k; \ - } \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. 
*/ \ - bli_auxinfo_set_is_a( istep_a, &aux ); \ -\ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the rectangular - part of B, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ - { \ - /* If the entire panel of B does not intersect the diagonal, there is - no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ - /* If the panel of B does intersect the diagonal, compute the number of - iterations in the rectangular region by dividing NR into the diagonal - offset. (There should never be any remainder in this division.) The - number of iterations in the triangular (or trapezoidal) region is - computed as the remaining number of iterations in the n dimension. */ \ - n_iter_rct = diagoffb / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ - /* Determine the thread range and increment for the 2nd and 1st loops for - the initial rectangular region of B (if it exists). - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and - 1st loops for the remaining triangular region of B (if it exists). - NOTE: We don't need to call bli_thread_range_jrir_rr() here since we - employ a hack that calls for each thread to execute every iteration - of the jr and ir loops but skip all but the pointer increment for - iterations that are not assigned to it. */ \ -\ - /* Advance the starting b1 and c1 pointers to the positions corresponding - to the start of the triangular region of B. 
*/ \ - jr_start = n_iter_rct; \ - b1 = b_cast + jr_start * cstep_b; \ - c1 = c_cast + jr_start * cstep_c; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < n_iter; ++j ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ -\ - /* Determine the offset to the beginning of the panel that - was packed so we can index into the corresponding location - in A. Then compute the length of that panel. */ \ - off_b1121 = bli_max( -diagoffb_j, 0 ); \ - k_b1121 = k - off_b1121; \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* If the current panel of B intersects the diagonal, scale C - by beta. If it is strictly below the diagonal, scale by one. - This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - { \ - /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_b_cur = k_b1121 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - if ( bli_trmm_my_iter_rr( j, thread ) ) { \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ -\ - ctype* restrict a1_i; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - a1_i = a1 + off_b1121 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. 
*/ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b1121, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ - } \ -\ - b1 += ps_b_cur; \ - } \ -\ - c1 += cstep_c; \ - } \ -\ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. 
+ dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + dim_t ir_nt = bli_thrinfo_n_way( caucus ); + dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + + // Note that we partition the 2nd loop into two regions: the rectangular + // part of B, and the triangular portion. + dim_t n_iter_rct; + dim_t n_iter_tri; + + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) + { + // If the entire panel of B does not intersect the diagonal, there is + // no triangular region, and therefore we can skip the second set of + // loops. + n_iter_rct = n_iter; + n_iter_tri = 0; + } + else + { + // If the panel of B does intersect the diagonal, compute the number of + // iterations in the rectangular region by dividing NR into the diagonal + // offset. (There should never be any remainder in this division.) The + // number of iterations in the triangular (or trapezoidal) region is + // computed as the remaining number of iterations in the n dimension. + n_iter_rct = diagoffb / NR; + n_iter_tri = n_iter - n_iter_rct; + } + + // Determine the thread range and increment for the 2nd and 1st loops for + // the initial rectangular region of B (if it exists). 
+ // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + { + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + // If there is no triangular region, then we're done. + if ( n_iter_tri == 0 ) return; + + // Use round-robin assignment of micropanels to threads in the 2nd and + // 1st loops for the remaining triangular region of B (if it exists). 
+ // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // employ a hack that calls for each thread to execute every iteration + // of the jr and ir loops but skip all but the pointer increment for + // iterations that are not assigned to it. + + // Advance the starting b1 and c1 pointers to the positions corresponding + // to the start of the triangular region of B. + jr_start = n_iter_rct; + const char* b1 = b_cast + jr_start * cstep_b; + char* c1 = c_cast + jr_start * cstep_c; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < n_iter; ++j ) + { + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + // Determine the offset to the beginning of the panel that + // was packed so we can index into the corresponding location + // in A. Then compute the length of that panel. + dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + dim_t k_b1121 = k - off_b1121; + + const char* a1 = a_cast; + char* c11 = c1; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + if ( bli_trmm_my_iter_rr( j, thread ) ) { + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trmm_my_iter_rr( i, caucus ) ) { + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + + a1 += rstep_a; + c11 += rstep_c; + } + } + + b1 += ps_b_cur; + } + + c1 += cstep_c; + } } -INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ) +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 4ed38e761..ca27caef1 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); - - void bli_trmm_ru_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = 
bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_ru_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b0111; \ - dim_t off_b0111; \ - dim_t i, j, jb0; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -200,312 +111,279 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of B is entirely below its diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ -\ - /* Compute k_full. For all trmm, k_full is simply k. 
This is - needed because some parameter combinations of trmm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of A (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = k; \ -\ - /* If there is a zero region to the left of where the diagonal of B - intersects the top edge of the panel, adjust the pointer to C and - treat this case as if the diagonal offset were zero. This skips over - the region that was not packed. (Note we assume the diagonal offset - is a multiple of MR; this assumption will hold as long as the cache - blocksizes are each a multiple of MR and NR.) */ \ - if ( diagoffb > 0 ) \ - { \ - j = diagoffb; \ - n = n - j; \ - diagoffb = 0; \ - c_cast = c_cast + (j )*cs_c; \ - } \ -\ - /* If there is a zero region below where the diagonal of B intersects the - right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffb + n < k ) \ - { \ - k = -diagoffb + n; \ - } \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. 
*/ \ - bli_auxinfo_set_is_a( istep_a, &aux ); \ -\ - /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ - /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ - { \ - /* If the entire panel of B does not intersect the diagonal, there is - no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ - /* If the panel of B does intersect the diagonal, compute the number of - iterations in the triangular (or trapezoidal) region by dividing NR - into the number of rows in B. (There should never be any remainder - in this division.) The number of iterations in the rectangular region - is computed as the remaining number of iterations in the n dimension. */ \ - n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ - /* Use round-robin assignment of micropanels to threads in the 2nd and - 1st loops for the initial triangular region of B (if it exists). - NOTE: We don't need to call bli_thread_range_jrir_rr() here since we - employ a hack that calls for each thread to execute every iteration - of the jr and ir loops but skip all but the pointer increment for - iterations that are not assigned to it. 
*/ \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter_tri; ++j ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ -\ - /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ \ - off_b0111 = 0; \ - k_b0111 = bli_min( k, -diagoffb_j + NR ); \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* If the current panel of B intersects the diagonal, scale C - by beta. If it is strictly below the diagonal, scale by one. - This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - { \ - /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_b_cur = k_b0111 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - if ( bli_trmm_my_iter_rr( j, thread ) ) { \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ -\ - ctype* restrict a1_i; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - a1_i = a1 + off_b0111 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b0111, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ - } \ -\ - b1 += ps_b_cur; \ - } \ -\ - c1 += cstep_c; \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ - /* Determine the thread range and increment for the 2nd and 1st loops for - the remaining rectangular region of B. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ - jb0 = n_iter_tri; \ -\ - /* Save the resulting value of b1 from the previous loop since it represents - the starting point for the rectangular region. */ \ - b_cast = b1; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - /* NOTE: We must index through b_cast differently since it contains - the starting address of the rectangular region (which is already - n_iter_tri logical iterations through B). */ \ - b1 = b_cast + (j-jb0) * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* If the current panel of B intersects the diagonal, scale C - by beta. If it is strictly below the diagonal, scale by one. 
- This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ -\ -\ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely below its diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region to the left of where the diagonal of B + // intersects the top edge of the panel, adjust the pointer to C and + // treat this case as if the diagonal offset were zero. 
This skips over + // the region that was not packed. (Note we assume the diagonal offset + // is a multiple of MR; this assumption will hold as long as the cache + // blocksizes are each a multiple of MR and NR.) + if ( diagoffb > 0 ) + { + n -= diagoffb; + c_cast += diagoffb * cs_c * dt_size; + diagoffb = 0; + } + + // If there is a zero region below where the diagonal of B intersects the + // right side of the block, shrink it to prevent "no-op" iterations from + // executing. + if ( -diagoffb + n < k ) + { + k = -diagoffb + n; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + dim_t ir_nt = bli_thrinfo_n_way( caucus ); + dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + + // Note that we partition the 2nd loop into two regions: the triangular + // part of C, and the rectangular portion. 
+ dim_t n_iter_tri; + dim_t n_iter_rct; + + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) + { + // If the entire panel of B does not intersect the diagonal, there is + // no triangular region, and therefore we can skip the first set of + // loops. + n_iter_tri = 0; + n_iter_rct = n_iter; + } + else + { + // If the panel of B does intersect the diagonal, compute the number of + // iterations in the triangular (or trapezoidal) region by dividing NR + // into the number of rows in B. (There should never be any remainder + // in this division.) The number of iterations in the rectangular region + // is computed as the remaining number of iterations in the n dimension. + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); + n_iter_rct = n_iter - n_iter_tri; + } + + // Use round-robin assignment of micropanels to threads in the 2nd and + // 1st loops for the initial triangular region of B (if it exists). + // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // employ a hack that calls for each thread to execute every iteration + // of the jr and ir loops but skip all but the pointer increment for + // iterations that are not assigned to it. + + const char* b1 = b_cast; + char* c1 = c_cast; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = 0; j < n_iter_tri; ++j ) + { + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. + dim_t off_b0111 = 0; + dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + + const char* a1 = a_cast; + char* c11 = c1; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. 
+ // This allows the current macro-kernel to work for both trmm + // and trmm3. + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + if ( bli_trmm_my_iter_rr( j, thread ) ) { + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trmm_my_iter_rr( i, caucus ) ) { + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + const char* a1_i = a1 + off_b0111 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b0111, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + + a1 += rstep_a; + c11 += rstep_c; + } + } + + b1 += ps_b_cur; + } + + c1 += cstep_c; + } + + // If there is no rectangular region, then we're done. + if ( n_iter_rct == 0 ) return; + + // Determine the thread range and increment for the 2nd and 1st loops for + // the remaining rectangular region of B. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. 
+ bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Advance the start and end iteration offsets for the rectangular region + // by the number of iterations used for the triangular region. + jr_start += n_iter_tri; + jr_end += n_iter_tri; + dim_t jb0 = n_iter_tri; + + // Save the resulting value of b1 from the previous loop since it represents + // the starting point for the rectangular region. + b_cast = b1; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + // NOTE: We must index through b_cast differently since it contains + // the starting address of the rectangular region (which is already + // n_iter_tri logical iterations through B). + b1 = b_cast + (j-jb0) * cstep_b; + c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + { + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. 
+ bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } } -INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) +//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index 2f0642ca8..f8c3d7ee2 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -87,7 +86,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index d42bc88c2..60030bf4a 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -47,14 +47,12 @@ void bli_trmm_xx_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - l3_var_oft f; + dim_t side; + dim_t uplo; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular @@ -73,7 +71,7 @@ void bli_trmm_xx_ker_var2 } // Index into the variant array to extract the correct function pointer. - f = vars[side][uplo]; + l3_var_oft f = vars[side][uplo]; // Call the macrokernel. 
f @@ -82,7 +80,6 @@ void bli_trmm_xx_ker_var2 b, c, cntx, - rntm, cntl, thread ); diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c index 706e14d43..ccf6e2160 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c @@ -322,8 +322,8 @@ void PASTEMAC(ch,varname) \ c1 = c_cast; \ \ thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + dim_t jr_num_threads = bli_thrinfo_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thrinfo_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c index 699892635..c3c11e62f 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c @@ -327,10 +327,10 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + /*dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); \ + dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \ \ dim_t jr_start, jr_end; \ /*dim_t ir_start, ir_end;*/ \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c index eb5577593..f69b38d7f 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c @@ -327,10 +327,10 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + /*dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); \ + dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \ \ dim_t jr_start, jr_end; \ /*dim_t ir_start, ir_end;*/ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c index 738711f58..7aaf2606f 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c @@ -329,8 +329,8 @@ void PASTEMAC(ch,varname) \ c1 = c_cast; \ \ thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + dim_t jr_num_threads = bli_thrinfo_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thrinfo_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( j = 0; j < n_iter; ++j ) \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c index df53b2011..e3d75d474 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c @@ -334,10 +334,10 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + /*dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); \ + dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \ \ dim_t jr_start, jr_end; \ /*dim_t ir_start, ir_end;*/ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c index fbcd4f9aa..700c54a0c 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c @@ -334,10 +334,10 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + /*dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); \ + dim_t ir_tid = bli_thrinfo_work_id( ir_thread );*/ \ \ dim_t jr_start, jr_end; \ /*dim_t ir_start, ir_end;*/ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c index 7775d9217..4499dd6ae 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c @@ -329,8 +329,8 @@ void PASTEMAC(ch,varname) \ c1 = c_cast; \ \ thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + dim_t jr_num_threads = bli_thrinfo_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thrinfo_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( j = 0; j < n_iter; ++j ) \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c index c1354a962..a35e6adf1 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c @@ -330,10 +330,10 @@ void PASTEMAC(ch,varname) \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c index 7cf8eeef0..438835156 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c @@ -330,10 +330,10 @@ void PASTEMAC(ch,varname) \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c index 1d0f31708..275d6ca47 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -330,8 +330,8 @@ void PASTEMAC(ch,varname) \ c1 = c_cast; \ \ thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ - dim_t jr_num_threads = 
bli_thread_n_way( jr_thread ); \ - dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ + dim_t jr_num_threads = bli_thrinfo_n_way( jr_thread ); \ + dim_t jr_thread_id = bli_thrinfo_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c index d8ae4f8bb..704b38833 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c @@ -335,10 +335,10 @@ void PASTEMAC(ch,varname) \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c index c05a082d4..eab41f665 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c @@ -335,10 +335,10 @@ void PASTEMAC(ch,varname) \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 9681eb640..88478713f 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -43,8 +43,7 @@ void bli_trmm3_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -171,8 +170,7 @@ void bli_trmm3_front beta, &c_local, cntx, - rntm, - cntl + rntm ); } diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index b5dde34cd..dcaa4d0ee 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -41,6 +41,5 @@ void bli_trmm3_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 413b12818..cfd1b4d7d 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -39,13 +39,12 @@ void bli_trsm_blk_var1 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { obj_t ap, cp; @@ -67,7 +66,13 @@ void bli_trsm_blk_var1 0, kc, &cp, &c1 ); // All threads iterate over the entire diagonal block A11. 
+ thrinfo_t* thread_pre = bli_thrinfo_sub_prenode( thread_par ); dim_t my_start = 0, my_end = kc; + //bli_thread_range_mdim + //( + // direct, thread_pre, &a11, b, &c1, cntl, cntx, + // &my_start, &my_end + //); #ifdef PRINT printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n", @@ -105,9 +110,8 @@ void bli_trsm_blk_var1 &BLIS_ONE, &c1_1, cntx, - rntm, bli_cntl_sub_prenode( cntl ), - bli_thrinfo_sub_prenode( thread ) + thread_pre ); } @@ -118,7 +122,7 @@ void bli_trsm_blk_var1 // We must execute a barrier here because the upcoming rank-k update // requires the packed matrix B to be fully updated by the trsm // subproblem. - bli_thread_barrier( rntm, thread ); + bli_thrinfo_barrier( thread_par ); // Isolate the remaining part of the column panel matrix A, which we do by // acquiring the subpartition ahead of A11 (that is, A21 or A01, depending @@ -137,6 +141,7 @@ void bli_trsm_blk_var1 // Determine the current thread's subpartition range for the gemm // subproblem over Ax1. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); bli_thread_range_mdim ( direct, thread, &ax1, b, &cx1, cntl, cntx, @@ -177,9 +182,8 @@ void bli_trsm_blk_var1 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) + thread ); } #ifdef PRINT diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 88db57e51..e86eb988a 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -37,13 +37,12 @@ void bli_trsm_blk_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { obj_t bp, cp; @@ -58,6 +57,7 @@ void bli_trsm_blk_var2 // Determine the current thread's subpartition range. 
dim_t my_start, my_end; + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); bli_thread_range_ndim ( direct, thread, a, &bp, &cp, cntl, cntx, @@ -88,9 +88,8 @@ void bli_trsm_blk_var2 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) + thread ); } } diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 229259a95..77a3b77d1 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -36,13 +36,12 @@ void bli_trsm_blk_var3 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { obj_t ap, bp, cs; @@ -50,6 +49,8 @@ void bli_trsm_blk_var3 bli_obj_alias_to( b, &bp ); bli_obj_alias_to( c, &cs ); + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + // Determine the direction in which to partition (forwards or backwards). dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl ); @@ -64,8 +65,8 @@ void bli_trsm_blk_var3 for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_trsm_determine_kc( direct, i, k_trans, &ap, &bp, - bli_cntl_bszid( cntl ), cntx ); + b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp, + bli_cntl_bszid( cntl ), cntx, cntl ); // Acquire partitions for A1 and B1. obj_t a1, b1; @@ -83,14 +84,10 @@ void bli_trsm_blk_var3 &BLIS_ONE, &cs, cntx, - rntm, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread ) + thread ); - //bli_thread_ibarrier( thread ); - bli_thread_barrier( rntm, bli_thrinfo_sub_node( thread ) ); - // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. 
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 0a3be87f7..d036e94c7 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -37,7 +37,7 @@ cntl_t* bli_trsm_cntl_create ( - rntm_t* rntm, + pool_t* pool, side_t side, pack_t schema_a, pack_t schema_b, @@ -45,14 +45,14 @@ cntl_t* bli_trsm_cntl_create ) { if ( bli_is_left( side ) ) - return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker ); + return bli_trsm_l_cntl_create( pool, schema_a, schema_b, ker ); else - return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker ); + return bli_trsm_r_cntl_create( pool, schema_a, schema_b, ker ); } cntl_t* bli_trsm_l_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -73,18 +73,18 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* gemm_cntl_bu_ke = bli_trsm_cntl_create_node ( - rntm, // the thread's runtime structure - family, // the operation family - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. + pool, // the thread's sba pool + family, // the operation family + BLIS_MR, + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. ); cntl_t* gemm_cntl_bp_bu = bli_trsm_cntl_create_node ( - rntm, + pool, family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + BLIS_NR, macro_kernel_p, gemm_cntl_bu_ke ); @@ -92,14 +92,14 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packa, // trsm operation's packm function for A. BLIS_MR, BLIS_MR, - FALSE, // do NOT invert diagonal - TRUE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_a, // normally BLIS_PACKED_ROW_PANELS + FALSE, // do NOT invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); @@ -110,18 +110,18 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( - rntm, // the thread's runtime structure - family, // the operation family - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. + pool, // the thread's sba pool + family, // the operation family + BLIS_MR, + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. ); cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( - rntm, + pool, family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + BLIS_NR, macro_kernel_p, trsm_cntl_bu_ke ); @@ -129,18 +129,18 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packa, // trsm operation's packm function for A. BLIS_MR, BLIS_MR, #ifdef BLIS_ENABLE_TRSM_PREINVERSION - TRUE, // invert diagonal + TRUE, // invert diagonal #else - FALSE, // do NOT invert diagonal + FALSE, // do NOT invert diagonal #endif - TRUE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_a, // normally BLIS_PACKED_ROW_PANELS + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); @@ -151,7 +151,7 @@ cntl_t* bli_trsm_l_cntl_create // NOTE: We attach the gemm sub-tree as the main branch. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_MC, bli_trsm_blk_var1, @@ -166,14 +166,14 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix B. cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packb, BLIS_NR, BLIS_MR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? 
- FALSE, // reverse iteration if lower? - schema_b, // normally BLIS_PACKED_COL_PANELS + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); @@ -181,7 +181,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_KC, bli_trsm_blk_var3, @@ -191,7 +191,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for partitioning the n dimension by NC. cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NC, bli_trsm_blk_var2, @@ -203,7 +203,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -220,7 +220,7 @@ cntl_t* bli_trsm_r_cntl_create // Create two nodes for the macro-kernel. cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -229,7 +229,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, @@ -239,7 +239,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packa, BLIS_NR, BLIS_MR, @@ -254,7 +254,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the m dimension by MC. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_MC, bli_trsm_blk_var1, @@ -264,7 +264,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for packing matrix B. 
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( - rntm, + pool, bli_l3_packb, BLIS_MR, BLIS_MR, @@ -279,7 +279,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_KC, bli_trsm_blk_var3, @@ -289,7 +289,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the n dimension by NC. cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( - rntm, + pool, family, BLIS_NC, bli_trsm_blk_var2, @@ -301,25 +301,24 @@ cntl_t* bli_trsm_r_cntl_create void bli_trsm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ) { - bli_cntl_free( rntm, cntl, thread ); + bli_cntl_free( pool, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ) { - return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( pool, family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 86f4a29b2..a23120ff8 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -35,7 +35,7 @@ cntl_t* bli_trsm_cntl_create ( - rntm_t* rntm, + pool_t* pool, side_t side, pack_t schema_a, pack_t schema_b, @@ -44,7 +44,7 @@ cntl_t* bli_trsm_cntl_create cntl_t* bli_trsm_l_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -52,7 +52,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, + pool_t* pool, pack_t schema_a, pack_t schema_b, void_fp ker @@ -60,16 +60,15 @@ cntl_t* bli_trsm_r_cntl_create void bli_trsm_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ); // 
----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index b94a129d9..4672366e5 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -42,8 +42,7 @@ void bli_trsm_front const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -153,8 +152,7 @@ void bli_trsm_front alpha, &c_local, cntx, - rntm, - cntl + rntm ); } diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index b31e88b04..dacfd19e9 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -40,8 +40,7 @@ void bli_trsm_front const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); #ifdef BLIS_ENABLE_SMALL_MATRIX diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 075b40336..e2128f100 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); - - void bli_trsm_ll_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffa = 
bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,105 +87,23 @@ void bli_trsm_ll_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + // Cast the micro-kernel address to its function pointer type. 
+ gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1011; \ - dim_t k_a10; \ - dim_t off_a10; \ - dim_t off_a11; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -221,262 +116,224 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. - So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ -\ - /* Compute k_full as k inflated up to a multiple of MR. This is - needed because some parameter combinations of trsm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ -\ - /* If there is a zero region above where the diagonal of A intersects the - left edge of the block, adjust the pointer to C and treat this case as - if the diagonal offset were zero. 
This skips over the region that was - not packed. (Note we assume the diagonal offset is a multiple of MR; - this assumption will hold as long as the cache blocksizes are each a - multiple of MR and NR.) */ \ - if ( diagoffa < 0 ) \ - { \ - i = -diagoffa; \ - m = m - i; \ - diagoffa = 0; \ - c_cast = c_cast + (i )*rs_c; \ - } \ -\ - /* Check the k dimension, which needs to be a multiple of MR. If k - isn't a multiple of MR, we adjust it higher to satisfy the micro- - kernel, which is expecting to perform an MR x MR triangular solve. - This adjustment of k is consistent with what happened when A was - packed: all of its bottom/right edges were zero-padded, and - furthermore, the panel that stores the bottom-right corner of the - matrix has its diagonal extended into the zero-padded region (as - identity). This allows the trsm of that bottom-right panel to - proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of B. */ \ - if ( k % MR != 0 ) k += MR - ( k % MR ); \ -\ - /* NOTE: We don't need to check that m is a multiple of PACKMR since we - know that the underlying buffer was already allocated to have an m - dimension that is a multiple of PACKMR, with the region between the - last row and the next multiple of MR zero-padded accordingly. */ \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. 
*/ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ - /* We don't bother querying the thrinfo_t node for the 1st loop because - we can't parallelize that loop in trsm due to the inter-iteration - dependencies that exist. */ \ - /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ -\ - dim_t jr_start, jr_end; \ - dim_t jr_inc; \ -\ - /* Determine the thread range and increment for the 2nd loop. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. - NOTE: Parallelism in the 1st loop is unattainable due to the - inter-iteration dependencies present in trsm. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1 + (0 )*rstep_c; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* If the current panel of A intersects the diagonal, use a - special micro-kernel that performs a fused gemm and trsm. - If the current panel of A resides below the diagonal, use a - a regular gemm micro-kernel. 
Otherwise, if it is above the - diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a10; \ - ctype* restrict a11; \ - ctype* restrict b01; \ - ctype* restrict b11; \ - ctype* restrict a2; \ -\ - /* Compute various offsets into and lengths of parts of A. */ \ - off_a10 = 0; \ - k_a1011 = diagoffa_i + MR; \ - k_a10 = k_a1011 - MR; \ - off_a11 = k_a10; \ -\ - /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1011 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ - /* Compute the addresses of the panel A10 and the triangular - block A11. */ \ - a10 = a1; \ - a11 = a1 + k_a10 * PACKMR; \ - /*a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );*/ \ -\ - /* Compute the addresses of the panel B01 and the block - B11. */ \ - b01 = b1 + off_a10 * PACKNR; \ - b11 = b1 + off_a11 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a10, \ - alpha1_cast, \ - a10, \ - a11, \ - b01, \ - b11, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + rstep_a; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - a1, \ - b1, \ - alpha2_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += rstep_a; \ - } \ -\ - c11 += rstep_c; \ - } \ - } \ -\ -/* -PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ - ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ -*/ \ -\ -/* -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ -*/ \ -\ -/* -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ -*/ \ + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If matrix A is above the diagonal, it is implicitly zero. + // So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; + + // Inflate k up to the next multiple of MR. 
This is + // needed because the micro-kernel expects to perform an MR x MR + // triangular solve. The adjustment is consistent with how A was + // packed: its bottom/right edges were zero-padded, and the panel + // holding the bottom-right corner had its diagonal extended into + // the zero-padded region (as identity). + if ( k % MR != 0 ) k += MR - ( k % MR ); + + // If there is a zero region above where the diagonal of A intersects the + // left edge of the block, adjust the pointer to C and treat this case as + // if the diagonal offset were zero. This skips over the region that was + // not packed. (Note we assume the diagonal offset is a multiple of MR; + // this assumption will hold as long as the cache blocksizes are each a + // multiple of MR and NR.) + if ( diagoffa < 0 ) + { + m += diagoffa; + c_cast -= diagoffa * rs_c * dt_size; + diagoffa = 0; + } + + // NOTE: We don't need to check that m is a multiple of PACKMR since we + // know that the underlying buffer was already allocated to have an m + // dimension that is a multiple of PACKMR, with the region between the + // last row and the next multiple of MR zero-padded accordingly. + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // We don't bother querying the thrinfo_t node for the 1st loop because + // we can't parallelize that loop in trsm due to the inter-iteration + // dependencies that exist. 
+ //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + + dim_t jr_start, jr_end; + dim_t jr_inc; + + // Determine the thread range and increment for the 2nd loop. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is unattainable due to the + // inter-iteration dependencies present in trsm. + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1 + (0 )*rstep_c; + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // If the current panel of A intersects the diagonal, use a + // special micro-kernel that performs a fused gemm and trsm. + // If the current panel of A resides below the diagonal, use a + // a regular gemm micro-kernel. Otherwise, if it is above the + // diagonal, it was not packed (because it is implicitly zero) + // and so we do nothing. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Compute various offsets into and lengths of parts of A. 
+ dim_t off_a10 = 0; + dim_t k_a1011 = diagoffa_i + MR; + dim_t k_a10 = k_a1011 - MR; + dim_t off_a11 = k_a10; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + // Compute the addresses of the panel A10 and the triangular + // block A11. + const char* a10 = a1; + const char* a11 = a1 + k_a10 * PACKMR * dt_size; + //a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 ); + + // Compute the addresses of the panel B01 and the block + // B11. + const char* b01 = b1 + off_a10 * PACKNR * dt_size; + const char* b11 = b1 + off_a11 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1 + ps_a_cur; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_a10, + ( void* )alpha1_cast, + ( void* )a10, + ( void* )a11, + ( void* )b01, + ( void* )b11, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) + { + // Compute the addresses of the next panels of A and B. + const char* a2 = a1 + rstep_a; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. 
+ gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )a1, + ( void* )b1, + ( void* )alpha2_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + } + + c11 += rstep_c; + } + } } -INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) +/* +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, + ( double* )a11, 1, PACKMR, "%4.1f", "" ); + +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); + +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k_full, a1, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k_full, NR, bp, NR, 1, "%5.2f", "" ); +*/ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 799fdd101..314ee3070 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); - - void bli_trsm_lu_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - 
const doff_t diagoffa = bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,106 +87,23 @@ void bli_trsm_lu_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + // Cast the micro-kernel address to its function pointer type. 
+ gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1112; \ - dim_t k_a11; \ - dim_t k_a12; \ - dim_t off_a11; \ - dim_t off_a12; \ - dim_t i, j, ib; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -222,275 +116,243 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. - So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ -\ - /* Compute k_full as k inflated up to a multiple of MR. This is - needed because some parameter combinations of trsm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % MR != 0 ? 
k + MR - ( k % MR ) : k ); \ -\ - /* If there is a zero region to the left of where the diagonal of A - intersects the top edge of the block, adjust the pointer to B and - treat this case as if the diagonal offset were zero. Note that we - don't need to adjust the pointer to A since packm would have simply - skipped over the region that was not stored. */ \ - if ( diagoffa > 0 ) \ - { \ - i = diagoffa; \ - k = k - i; \ - diagoffa = 0; \ - b_cast = b_cast + i * PACKNR; \ - } \ -\ - /* If there is a zero region below where the diagonal of A intersects the - right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffa + k < m ) \ - { \ - m = -diagoffa + k; \ - } \ -\ - /* Check the k dimension, which needs to be a multiple of MR. If k - isn't a multiple of MR, we adjust it higher to satisfy the micro- - kernel, which is expecting to perform an MR x MR triangular solve. - This adjustment of k is consistent with what happened when A was - packed: all of its bottom/right edges were zero-padded, and - furthermore, the panel that stores the bottom-right corner of the - matrix has its diagonal extended into the zero-padded region (as - identity). This allows the trsm of that bottom-right panel to - proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of B. */ \ - if ( k % MR != 0 ) k += MR - ( k % MR ); \ -\ - /* NOTE: We don't need to check that m is a multiple of PACKMR since we - know that the underlying buffer was already allocated to have an m - dimension that is a multiple of PACKMR, with the region between the - last row and the next multiple of MR zero-padded accordingly. */ \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. 
*/ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ - /* We don't bother querying the thrinfo_t node for the 1st loop because - we can't parallelize that loop in trsm due to the inter-iteration - dependencies that exist. */ \ - /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ -\ - dim_t jr_start, jr_end; \ - dim_t jr_inc; \ -\ - /* Determine the thread range and increment for the 2nd loop. - NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. - NOTE: Parallelism in the 1st loop is unattainable due to the - inter-iteration dependencies present in trsm. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1 + (m_iter-1)*rstep_c; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( ib = 0; ib < m_iter; ++ib ) \ - { \ - i = m_iter - 1 - ib; \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ -\ - /* If the current panel of A intersects the diagonal, use a - special micro-kernel that performs a fused gemm and trsm. - If the current panel of A resides above the diagonal, use a - a regular gemm micro-kernel. Otherwise, if it is below the - diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a11; \ - ctype* restrict a12; \ - ctype* restrict b11; \ - ctype* restrict b21; \ - ctype* restrict a2; \ -\ - /* Compute various offsets into and lengths of parts of A. */ \ - off_a11 = diagoffa_i; \ - k_a1112 = k - off_a11;; \ - k_a11 = MR; \ - k_a12 = k_a1112 - MR; \ - off_a12 = off_a11 + k_a11; \ -\ - /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1112 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ - /* Compute the addresses of the triangular block A11 and the - panel A12. */ \ - a11 = a1; \ - a12 = a1 + k_a11 * PACKMR; \ - /*a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );*/ \ -\ - /* Compute the addresses of the panel B01 and the block - B11. */ \ - b11 = b1 + off_a11 * PACKNR; \ - b21 = b1 + off_a12 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a12, \ - alpha1_cast, \ - a12, \ - a11, \ - b21, \ - b11, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ - if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - a1, \ - b1, \ - alpha2_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += rstep_a; \ - } \ -\ - c11 -= rstep_c; \ - } \ - } \ -\ -/* -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ -printf( "m_iter = %lu\n", m_iter ); \ -printf( "m_cur = %lu\n", m_cur ); \ -printf( "k = %lu\n", k ); \ -printf( "diagoffa_i = %lu\n", diagoffa_i ); \ -printf( "off_a1112 = %lu\n", off_a1112 ); \ -printf( "k_a1112 = %lu\n", k_a1112 ); \ -printf( "k_a12 = %lu\n", k_a12 ); \ -printf( "k_a11 = %lu\n", k_a11 ); \ -printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ -printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ -*/ \ -\ -/* -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ -*/ \ + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If matrix A is below the diagonal, it is implicitly zero. + // So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; + + // If there is a zero region to the left of where the diagonal of A + // intersects the top edge of the block, adjust the pointer to B and + // treat this case as if the diagonal offset were zero. 
Note that we + // don't need to adjust the pointer to A since packm would have simply + // skipped over the region that was not stored. + if ( diagoffa > 0 ) + { + k -= diagoffa; + b_cast += diagoffa * PACKNR * dt_size; + diagoffa = 0; + } + + // If there is a zero region below where the diagonal of A intersects the + // right side of the block, shrink it to prevent "no-op" iterations from + // executing. + if ( -diagoffa + k < m ) + { + m = -diagoffa + k; + } + + // Check the k dimension, which needs to be a multiple of MR. If k + // isn't a multiple of MR, we adjust it higher to satisfy the micro- + // kernel, which is expecting to perform an MR x MR triangular solve. + // This adjustment of k is consistent with what happened when A was + // packed: all of its bottom/right edges were zero-padded, and + // furthermore, the panel that stores the bottom-right corner of the + // matrix has its diagonal extended into the zero-padded region (as + // identity). This allows the trsm of that bottom-right panel to + // proceed without producing any infs or NaNs that would infect the + // "good" values of the corresponding block of B. + if ( k % MR != 0 ) k += MR - ( k % MR ); + + // NOTE: We don't need to check that m is a multiple of PACKMR since we + // know that the underlying buffer was already allocated to have an m + // dimension that is a multiple of PACKMR, with the region between the + // last row and the next multiple of MR zero-padded accordingly. + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. 
+ inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // We don't bother querying the thrinfo_t node for the 1st loop because + // we can't parallelize that loop in trsm due to the inter-iteration + // dependencies that exist. + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + + dim_t jr_start, jr_end; + dim_t jr_inc; + + // Determine the thread range and increment for the 2nd loop. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is unattainable due to the + // inter-iteration dependencies present in trsm. + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1 + (m_iter-1)*rstep_c; + + // Loop over the m dimension (MR rows at a time). + for ( dim_t ib = 0; ib < m_iter; ++ib ) + { + dim_t i = m_iter - 1 - ib; + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? 
MR : m_left ); + + // If the current panel of A intersects the diagonal, use a + // special micro-kernel that performs a fused gemm and trsm. + // If the current panel of A resides above the diagonal, use + // a regular gemm micro-kernel. Otherwise, if it is below the + // diagonal, it was not packed (because it is implicitly zero) + // and so we do nothing. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Compute various offsets into and lengths of parts of A. + dim_t off_a11 = diagoffa_i; + dim_t k_a1112 = k - off_a11;; + dim_t k_a11 = MR; + dim_t k_a12 = k_a1112 - MR; + dim_t off_a12 = off_a11 + k_a11; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1112 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + // Compute the addresses of the triangular block A11 and the + // panel A12. + const char* a11 = a1; + const char* a12 = a1 + k_a11 * PACKMR * dt_size; + //a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 ); + + // Compute the addresses of the block B11 and the panel + // B21. + const char* b11 = b1 + off_a11 * PACKNR * dt_size; + const char* b21 = b1 + off_a12 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1 + ps_a_cur; + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_a12, + ( void* )alpha1_cast, + ( void* )a12, + ( void* )a11, + ( void* )b21, + ( void* )b11, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) + { + // Compute the addresses of the next panels of A and B.
+ const char* a2 = a1 + rstep_a; + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )a1, + ( void* )b1, + ( void* )alpha2_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + } + + c11 -= rstep_c; + } + } } -INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) +/* +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); +printf( "m_iter = %lu\n", m_iter ); +printf( "m_cur = %lu\n", m_cur ); +printf( "k = %lu\n", k ); +printf( "diagoffa_i = %lu\n", diagoffa_i ); +printf( "off_a1112 = %lu\n", off_a1112 ); +printf( "k_a1112 = %lu\n", k_a1112 ); +printf( "k_a12 = %lu\n", k_a12 ); +printf( "k_a11 = %lu\n", k_a11 ); +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); + +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); +*/ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 721203df7..42e72840e 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, 
dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); - - void bli_trsm_rl_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size ( dt ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,111 +87,28 @@ void bli_trsm_rl_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + // Cast the micro-kernel address to its function pointer type. 
+ // NOTE: We use the upper-triangular gemmtrsm ukernel because, while + // the current macro-kernel targets the "rl" case (right-side/lower- + // triangular), it becomes upper-triangular after the kernel operation + // is transposed so that all kernel instances are of the "left" + // variety (since those are the only trsm ukernels that exist). + gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ - /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while - the current macro-kernel targets the "rl" case (right-side/lower- - triangular), it becomes upper-triangular after the kernel operation - is transposed so that all kernel instances are of the "left" - variety (since those are the only trsm ukernels that exist). 
*/ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b1121; \ - dim_t k_b11; \ - dim_t k_b21; \ - dim_t off_b11; \ - dim_t off_b21; \ - dim_t i, j, jb; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -235,41 +129,32 @@ void PASTEMAC(ch,varname) \ transposing the operation, then A needs to be packed with NR and B needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. 
*/ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current panel of B is entirely above its diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ -\ - /* Compute k_full as k inflated up to a multiple of NR. This is - needed because some parameter combinations of trsm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ -\ + it is implicitly zero. So we do nothing. */ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to B since packm would have simply skipped over - the region that was not stored. */ \ - if ( diagoffb < 0 ) \ - { \ - j = -diagoffb; \ - k = k - j; \ - diagoffb = 0; \ - a_cast = a_cast + j * PACKMR; \ - } \ -\ + the region that was not stored. 
*/ + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + /* If there is a zero region to the right of where the diagonal of B intersects the bottom of the panel, shrink it so that we can index to the correct place in C (corresponding to the @@ -277,12 +162,12 @@ void PASTEMAC(ch,varname) \ NOTE: This is NOT being done to skip over "no-op" iterations, as with the trsm_lu macro-kernel. This MUST be done for correct execution because we use n (via n_iter) to compute diagonal and - index offsets for backwards movement through B. */ \ - if ( diagoffb + k < n ) \ - { \ - n = diagoffb + k; \ - } \ -\ + index offsets for backwards movement through B. */ + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + /* Check the k dimension, which needs to be a multiple of NR. If k isn't a multiple of NR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an NR x NR triangular solve. @@ -292,209 +177,188 @@ void PASTEMAC(ch,varname) \ matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of A. */ \ - if ( k % NR != 0 ) k += NR - ( k % NR ); \ -\ + "good" values of the corresponding block of A. */ + if ( k % NR != 0 ) k += NR - ( k % NR ); + /* NOTE: We don't need to check that n is a multiple of PACKNR since we know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the - last column and the next multiple of NR zero-padded accordingly. */ \ -\ + last column and the next multiple of NR zero-padded accordingly. */ + + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + /* Compute number of primary and leftover components of the m and n - dimensions. 
*/ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + /* Save the pack schemas of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_schema_a( schema_b, &aux ); \ - bli_auxinfo_set_schema_b( schema_a, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. - NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_is_b( istep_a, &aux ); \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( jb = 0; jb < n_iter; ++jb ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b11; \ - ctype* restrict b21; \ - ctype* restrict b2; \ -\ - j = n_iter - 1 - jb; \ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ - a1 = a_cast; \ - c11 = c1 + (n_iter-1)*cstep_c; \ -\ - n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ + "A" matrix is actually contained within B. 
*/ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_b, &aux ); + bli_auxinfo_set_schema_b( schema_a, &aux ); + + const char* b1 = b_cast; + char* c1 = c_cast; + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t jb = 0; jb < n_iter; ++jb ) + { + dim_t j = n_iter - 1 - jb; + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + dim_t n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); + + const char* a1 = a_cast; + char* c11 = c1 + (n_iter-1)*cstep_c; + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of B resides below the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is above the diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ - { \ + and so we do nothing. */ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ \ - off_b11 = bli_max( -diagoffb_j, 0 ); \ - k_b1121 = k - off_b11; \ - k_b11 = NR; \ - k_b21 = k_b1121 - NR; \ - off_b21 = off_b11 + k_b11; \ -\ + so we can index into the corresponding location in A. */ + dim_t off_b11 = bli_max( -diagoffb_j, 0 ); + dim_t k_b1121 = k - off_b11; + dim_t k_b11 = NR; + dim_t k_b21 = k_b1121 - NR; + dim_t off_b21 = off_b11 + k_b11; + /* Compute the addresses of the triangular block B11 and the - panel B21. */ \ - b11 = b1; \ - b21 = b1 + k_b11 * PACKNR; \ - /*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ \ -\ - /* Compute the panel stride for the current micro-panel. */ \ - is_b_cur = k_b1121 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a11; \ - ctype* restrict a12; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the A11 block and A12 panel. */ \ - a11 = a1 + off_b11 * PACKMR; \ - a12 = a1 + off_b21 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ + panel B21. */ + const char* b11 = b1; + const char* b21 = b1 + k_b11 * PACKNR * dt_size; + /*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ + + /* Compute the panel stride for the current micro-panel. */ + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the A11 block and A12 panel. */ + const char* a11 = a1 + off_b11 * PACKMR * dt_size; + const char* a12 = a1 + off_b21 * PACKMR * dt_size; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + ps_b_cur; + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. 
*/ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b21, \ - alpha1_cast, \ - b21, \ - b11, \ - a12, \ - a11, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += ps_b_cur; \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ + triangular "A" matrix is actually contained within B. */ + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_b21, + ( void* )alpha1_cast, + ( void* )b21, + ( void* )b11, + ( void* )a12, + ( void* )a11, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) + { + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. 
*/ + const char* a2 = a1; + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + cstep_b; + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - b1, \ - a1, \ - alpha2_cast, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += cstep_b; \ - } \ -\ - c1 -= cstep_c; \ - } \ -} + triangular "A" matrix is actually contained within B. */ + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )b1, + ( void* )a1, + ( void* )alpha2_cast, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } -INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 ) + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += cstep_b; + } + + c1 -= cstep_c; + } +} diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 447fbf8cd..6cc9a8bbb 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); - - void bli_trsm_ru_ker_var2 ( const obj_t* a, const 
obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread_par ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,110 +87,28 @@ void bli_trsm_ru_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + // Cast the micro-kernel address to its function pointer type. + // NOTE: We use the lower-triangular gemmtrsm ukernel because, while + // the current macro-kernel targets the "ru" case (right-side/upper- + // triangular), it becomes lower-triangular after the kernel operation + // is transposed so that all kernel instances are of the "left" + // variety (since those are the only trsm ukernels that exist). 
+ gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ - /* NOTE: We use the lower-triangular gemmtrsm ukernel because, while - the current macro-kernel targets the "ru" case (right-side/upper- - triangular), it becomes lower-triangular after the kernel operation - is transposed so that all kernel instances are of the "left" - variety (since those are the only trsm ukernels that exist). */ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. 
*/ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b0111; \ - dim_t k_b01; \ - dim_t off_b01; \ - dim_t off_b11; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -234,260 +129,230 @@ void PASTEMAC(ch,varname) \ transposing the operation, then A needs to be packed with NR and B needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). - */ \ -\ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ - /* Safeguard: If the current panel of B is entirely below its diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ -\ - /* Compute k_full as k inflated up to a multiple of NR. 
This is - needed because some parameter combinations of trsm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ -\ - /* If there is a zero region to the left of where the diagonal of B - intersects the top edge of the panel, adjust the pointer to C and - treat this case as if the diagonal offset were zero. This skips over - the region that was not packed. (Note we assume the diagonal offset - is a multiple of MR; this assumption will hold as long as the cache - blocksizes are each a multiple of MR and NR.) */ \ - if ( diagoffb > 0 ) \ - { \ - j = diagoffb; \ - n = n - j; \ - diagoffb = 0; \ - c_cast = c_cast + (j )*cs_c; \ - } \ -\ - /* If there is a zero region below where the diagonal of B intersects the - right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffb + n < k ) \ - { \ - k = -diagoffb + n; \ - } \ -\ - /* Check the k dimension, which needs to be a multiple of NR. If k - isn't a multiple of NR, we adjust it higher to satisfy the micro- - kernel, which is expecting to perform an NR x NR triangular solve. - This adjustment of k is consistent with what happened when B was - packed: all of its bottom/right edges were zero-padded, and - furthermore, the panel that stores the bottom-right corner of the - matrix has its diagonal extended into the zero-padded region (as - identity). This allows the trsm of that bottom-right panel to - proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of A. 
*/ \ - if ( k % NR != 0 ) k += NR - ( k % NR ); \ -\ - /* NOTE: We don't need to check that n is a multiple of PACKNR since we - know that the underlying buffer was already allocated to have an n - dimension that is a multiple of PACKNR, with the region between the - last column and the next multiple of NR zero-padded accordingly. */ \ -\ - /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. - NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_schema_a( schema_b, &aux ); \ - bli_auxinfo_set_schema_b( schema_a, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. - NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_is_b( istep_a, &aux ); \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b01; \ - ctype* restrict b11; \ - ctype* restrict b2; \ -\ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ - a1 = a_cast; \ - c11 = c1; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. 
*/ \ - b2 = b1; \ -\ - /* If the current panel of B intersects the diagonal, use a - special micro-kernel that performs a fused gemm and trsm. - If the current panel of B resides above the diagonal, use a - a regular gemm micro-kernel. Otherwise, if it is below the - diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ - { \ - /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ \ - off_b01 = 0; \ - k_b0111 = bli_min( k, -diagoffb_j + NR ); \ - k_b01 = k_b0111 - NR; \ - off_b11 = k_b01; \ -\ - /* Compute the addresses of the panel B10 and the triangular - block B11. */ \ - b01 = b1; \ - b11 = b1 + k_b01 * PACKNR; \ - /*b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ \ -\ - /* Compute the panel stride for the current micro-panel. */ \ - is_b_cur = k_b0111 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a10; \ - ctype* restrict a11; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the A10 panel and A11 block. */ \ - a10 = a1 + off_b01 * PACKMR; \ - a11 = a1 + off_b11 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. 
*/ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b01, \ - alpha1_cast, \ - b01, \ - b11, \ - a10, \ - a11, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += ps_b_cur; \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ - /* Save addresses of next panels of A and B to the auxinfo_t - object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - b1, \ - a1, \ - alpha2_cast, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += cstep_b; \ - } \ -\ - c1 += cstep_c; \ - } \ -} + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely below its diagonal, + // it is implicitly zero. So we do nothing. 
+ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region to the left of where the diagonal of B + // intersects the top edge of the panel, adjust the pointer to C and + // treat this case as if the diagonal offset were zero. This skips over + // the region that was not packed. (Note we assume the diagonal offset + // is a multiple of MR; this assumption will hold as long as the cache + // blocksizes are each a multiple of MR and NR.) + if ( diagoffb > 0 ) + { + n -= diagoffb; + c_cast += diagoffb * cs_c * dt_size; + diagoffb = 0; + } + + // If there is a zero region below where the diagonal of B intersects the + // right side of the block, shrink it to prevent "no-op" iterations from + // executing. + if ( -diagoffb + n < k ) + { + k = -diagoffb + n; + } + + // Check the k dimension, which needs to be a multiple of NR. If k + // isn't a multiple of NR, we adjust it higher to satisfy the micro- + // kernel, which is expecting to perform an NR x NR triangular solve. + // This adjustment of k is consistent with what happened when B was + // packed: all of its bottom/right edges were zero-padded, and + // furthermore, the panel that stores the bottom-right corner of the + // matrix has its diagonal extended into the zero-padded region (as + // identity). This allows the trsm of that bottom-right panel to + // proceed without producing any infs or NaNs that would infect the + // "good" values of the corresponding block of A. + if ( k % NR != 0 ) k += NR - ( k % NR ); + + // NOTE: We don't need to check that n is a multiple of PACKNR since we + // know that the underlying buffer was already allocated to have an n + // dimension that is a multiple of PACKNR, with the region between the + // last column and the next multiple of NR zero-padded accordingly. + + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + + // Compute number of primary and leftover components of the m and n + // dimensions. 
+ dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + // NOTE: We swap the values for A and B since the triangular + // "A" matrix is actually contained within B. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_b, &aux ); + bli_auxinfo_set_schema_b( schema_a, &aux ); + + const char* b1 = b_cast; + char* c1 = c_cast; -INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = 0; j < n_iter; ++j ) + { + dim_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + const char* a1 = a_cast; + char* c11 = c1; + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // If the current panel of B intersects the diagonal, use a + // special micro-kernel that performs a fused gemm and trsm. + // If the current panel of B resides above the diagonal, use a + // a regular gemm micro-kernel. Otherwise, if it is below the + // diagonal, it was not packed (because it is implicitly zero) + // and so we do nothing. + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. + dim_t off_b01 = 0; + dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + dim_t k_b01 = k_b0111 - NR; + dim_t off_b11 = k_b01; + + // Compute the addresses of the panel B10 and the triangular + // block B11. 
+ const char* b01 = b1; + const char* b11 = b1 + k_b01 * PACKNR * dt_size; + //b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ + + // Compute the panel stride for the current micro-panel. + inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the A10 panel and A11 block. + const char* a10 = a1 + off_b01 * PACKMR * dt_size; + const char* a11 = a1 + off_b11 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1; + //if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + ps_b_cur; + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. NOTE: We swap the values for A and B since the + // triangular "A" matrix is actually contained within B. + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_b01, + ( void* )alpha1_cast, + ( void* )b01, + ( void* )b11, + ( void* )a10, + ( void* )a11, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) + { + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = a1; + //if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + cstep_b; + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. NOTE: We swap the values for A and B since the + // triangular "A" matrix is actually contained within B. + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )b1, + ( void* )a1, + ( void* )alpha2_cast, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += cstep_b; + } + + c1 += cstep_c; + } +} diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 7e747b4a8..a498e687e 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -43,12 +43,11 @@ \ void PASTEMAC0(opname) \ ( \ - const obj_t* a, \ - const obj_t* b, \ - const obj_t* c, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -63,36 +62,3 @@ GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoff, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, \ - dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) - diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index a0a59c0a8..39c5372f3 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -47,14 +47,12 @@ void bli_trsm_xx_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - dim_t side; - dim_t uplo; - l3_var_oft f; + dim_t side; + dim_t uplo; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular @@ -73,7 +71,7 @@ void bli_trsm_xx_ker_var2 } // Index into the variant array to extract the correct function pointer. - f = vars[side][uplo]; + l3_var_oft f = vars[side][uplo]; // Call the macrokernel. 
f @@ -82,7 +80,6 @@ void bli_trsm_xx_ker_var2 b, c, cntx, - rntm, cntl, thread ); diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c index 26da1b004..7a4d2c736 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c @@ -411,7 +411,7 @@ void PASTEMAC(ch,varname) \ a2 = a_cast; \ b2 = b1; \ /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \ b2 = b_cast; \ } \ \ @@ -476,7 +476,7 @@ void PASTEMAC(ch,varname) \ a2 = a_cast; \ b2 = b1; \ /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c index 607b40e54..ac4ab28b9 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c @@ -349,8 +349,8 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c index 3299b5f8e..7fa4bd2c0 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c @@ -349,8 +349,8 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c index b02ff0955..5379ac0ab 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c @@ -421,7 +421,7 @@ void PASTEMAC(ch,varname) \ a2 = a_cast; \ b2 = b1; \ /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \ b2 = b_cast; \ } \ \ @@ -486,7 +486,7 @@ void PASTEMAC(ch,varname) \ a2 = a_cast; \ b2 = b1; \ /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\ - if ( j + bli_thread_num_threads(thread) >= n_iter ) \ + if ( j + bli_thrinfo_num_threads(thread) >= n_iter ) \ b2 = b_cast; \ } \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c index e78cef477..fadf3b92b 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c @@ -357,8 +357,8 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c index 93cac371a..106ab499e 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c @@ -357,8 +357,8 @@ void PASTEMAC(ch,varname) \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c index 1e903c3c1..99e6d7984 100644 --- a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c @@ -443,7 +443,7 @@ void PASTEMAC(ch,varname) \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ @@ -523,7 +523,7 @@ void PASTEMAC(ch,varname) \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1; \ /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c index a44d64f45..ebddbcd19 100644 --- a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c @@ -436,7 +436,7 @@ void PASTEMAC(ch,varname) \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ @@ -516,7 +516,7 @@ void PASTEMAC(ch,varname) \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ + if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index b22ddbee0..daa092ba7 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -37,7 +37,7 @@ cntl_t* bli_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, @@ -46,14 +46,13 @@ cntl_t* bli_cntl_create_node ) { cntl_t* cntl; - mem_t* pack_mem; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_create_node(): " ); #endif // Allocate the cntl_t struct. - cntl = bli_sba_acquire( rntm, sizeof( cntl_t ) ); + cntl = bli_sba_acquire( pool, sizeof( cntl_t ) ); bli_cntl_set_family( family, cntl ); bli_cntl_set_bszid( bszid, cntl ); @@ -62,19 +61,12 @@ cntl_t* bli_cntl_create_node bli_cntl_set_sub_prenode( NULL, cntl ); bli_cntl_set_sub_node( sub_node, cntl ); - // Query the address of the node's packed mem_t entry so we can initialize - // key fields (to NULL or 0). 
- // NOTE: This initialization is important, since it allows threads to - // discern whether blocks have been acquired from the memory allocator. - pack_mem = bli_cntl_pack_mem( cntl ); - bli_mem_clear( pack_mem ); - return cntl; } void bli_cntl_free_node ( - rntm_t* rntm, + pool_t* pool, cntl_t* cntl ) { @@ -82,7 +74,7 @@ void bli_cntl_free_node printf( "bli_cntl_free_node(): " ); #endif - bli_sba_release( rntm, cntl ); + bli_sba_release( pool, cntl ); } void bli_cntl_clear_node @@ -90,39 +82,20 @@ void bli_cntl_clear_node cntl_t* cntl ) { - mem_t* pack_mem; - // Clear various fields in the control tree. Clearing these fields // actually is not needed, but we do it for debugging/completeness. bli_cntl_set_var_func( NULL, cntl ); bli_cntl_set_params( NULL, cntl ); bli_cntl_set_sub_prenode( NULL, cntl ); bli_cntl_set_sub_node( NULL, cntl ); - - // Clearing these fields is potentially more important if the control - // tree is cached somewhere and reused. - pack_mem = bli_cntl_pack_mem( cntl ); - bli_mem_clear( pack_mem ); } // ----------------------------------------------------------------------------- void bli_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - if ( thread != NULL ) bli_cntl_free_w_thrinfo( rntm, cntl, thread ); - else bli_cntl_free_wo_thrinfo( rntm, cntl ); -} - -void bli_cntl_free_w_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + pool_t* pool, + cntl_t* cntl ) { // Base case: simply return when asked to free NULL nodes. @@ -131,33 +104,13 @@ void bli_cntl_free_w_thrinfo cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); void* cntl_params = bli_cntl_params( cntl ); - mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); - - // Don't immediately dereference the prenode and subnode of the thrinfo_t - // node. 
In some cases, the thrinfo_t tree is not built out all the way, - // perhaps because there are more ways of parallelization than micropanels - // of data in this dimension, or because the problem is small enough that - // there is no gemm subproblem in bli_trsm_blk_var1(). Thus, we start with - // NULL values for these variables and only dereference the fields of the - // thrinfo_t struct if the thrinfo_t exists (ie: is non-NULL). We will also - // have to check the thrinfo_t pointer for NULLness before using it below, - // when checking if we need to free the pack_mem field of the cntl_t node - // (see below). - thrinfo_t* thread_sub_prenode = NULL; - thrinfo_t* thread_sub_node = NULL; - - if ( thread != NULL ) - { - thread_sub_prenode = bli_thrinfo_sub_prenode( thread ); - thread_sub_node = bli_thrinfo_sub_node( thread ); - } // Only recurse into prenode branch if it exists. if ( cntl_sub_prenode != NULL ) { // Recursively free all memory associated with the sub-prenode and its // children. - bli_cntl_free_w_thrinfo( rntm, cntl_sub_prenode, thread_sub_prenode ); + bli_cntl_free( pool, cntl_sub_prenode ); } // Only recurse into the child node if it exists. @@ -165,7 +118,7 @@ void bli_cntl_free_w_thrinfo { // Recursively free all memory associated with the sub-node and its // children. - bli_cntl_free_w_thrinfo( rntm, cntl_sub_node, thread_sub_node ); + bli_cntl_free( pool, cntl_sub_node ); } // Free the current node's params field, if it is non-NULL. @@ -175,80 +128,19 @@ void bli_cntl_free_w_thrinfo printf( "bli_cntl_free_w_thrinfo(): " ); #endif - bli_sba_release( rntm, cntl_params ); - } - - // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the mem_t entry is - // allocated, and only if the current thread is chief for its group. - // Also note that we don't proceed with either of the above tests if - // the thrinfo_t pointer is NULL. (See above for background on when - // this can happen.) 
- if ( thread != NULL ) - if ( bli_thread_am_ochief( thread ) ) - if ( bli_mem_is_alloc( cntl_pack_mem ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" ); - #endif - - bli_pba_release( rntm, cntl_pack_mem ); + bli_sba_release( pool, cntl_params ); } // Free the current node. - bli_cntl_free_node( rntm, cntl ); -} - -void bli_cntl_free_wo_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl - ) -{ - // Base case: simply return when asked to free NULL nodes. - if ( cntl == NULL ) return; - - cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); - cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); - void* cntl_params = bli_cntl_params( cntl ); - mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); - - { - // Recursively free all memory associated with the sub-prenode and its - // children. - bli_cntl_free_wo_thrinfo( rntm, cntl_sub_prenode ); - } - - { - // Recursively free all memory associated with the sub-node and its - // children. - bli_cntl_free_wo_thrinfo( rntm, cntl_sub_node ); - } - - // Free the current node's params field, if it is non-NULL. - if ( cntl_params != NULL ) - { - bli_sba_release( rntm, cntl_params ); - } - - // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the mem_t entry is - // allocated. - if ( bli_mem_is_alloc( cntl_pack_mem ) ) - { - bli_pba_release( rntm, cntl_pack_mem ); - } - - // Free the current node. - bli_cntl_free_node( rntm, cntl ); + bli_cntl_free_node( pool, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy ( - rntm_t* rntm, - cntl_t* cntl + pool_t* pool, + const cntl_t* cntl ) { // Make a copy of the current node. Notice that the source node @@ -257,7 +149,7 @@ cntl_t* bli_cntl_copy // field. 
cntl_t* cntl_copy = bli_cntl_create_node ( - rntm, + pool, bli_cntl_family( cntl ), bli_cntl_bszid( cntl ), bli_cntl_var_func( cntl ), @@ -273,7 +165,7 @@ cntl_t* bli_cntl_copy // struct. uint64_t params_size = bli_cntl_params_size( cntl ); void* params_orig = bli_cntl_params( cntl ); - void* params_copy = bli_sba_acquire( rntm, ( size_t )params_size ); + void* params_copy = bli_sba_acquire( pool, ( size_t )params_size ); // Copy the original params struct to the new memory region. memcpy( params_copy, params_orig, params_size ); @@ -288,7 +180,7 @@ cntl_t* bli_cntl_copy { cntl_t* sub_prenode_copy = bli_cntl_copy ( - rntm, + pool, bli_cntl_sub_prenode( cntl ) ); @@ -302,7 +194,7 @@ cntl_t* bli_cntl_copy { cntl_t* sub_node_copy = bli_cntl_copy ( - rntm, + pool, bli_cntl_sub_node( cntl ) ); diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 406a350ee..2c1aeb603 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -45,14 +45,7 @@ struct cntl_s void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; - - // Optional fields (needed only by some operations such as packm). - // NOTE: first field of params must be a uint64_t containing the size - // of the struct. void* params; - - // Internal fields that track "cached" data. 
- mem_t pack_mem; }; typedef struct cntl_s cntl_t; */ @@ -62,7 +55,7 @@ typedef struct cntl_s cntl_t; BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( - rntm_t* rntm, + pool_t* pool, opid_t family, bszid_t bszid, void_fp var_func, @@ -72,7 +65,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node BLIS_EXPORT_BLIS void bli_cntl_free_node ( - rntm_t* rntm, + pool_t* pool, cntl_t* cntl ); @@ -85,28 +78,14 @@ BLIS_EXPORT_BLIS void bli_cntl_clear_node BLIS_EXPORT_BLIS void bli_cntl_free ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl + pool_t* pool, + cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( - rntm_t* rntm, - cntl_t* cntl + pool_t* pool, + const cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family @@ -163,11 +142,6 @@ BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl ) return *( ( uint64_t* )(cntl->params) ); } -BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) -{ - return &(cntl->pack_mem); -} - // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl ) @@ -220,8 +194,3 @@ BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) cntl->params = params; } -BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) -{ - cntl->pack_mem = *pack_mem; -} - diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 827b19cfd..90050a5ed 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -357,8 +357,6 @@ BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... 
); - #endif diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index c25511486..b46c0509d 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -136,13 +136,26 @@ BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. +#ifdef __cplusplus +#define BLIS_MEM_INITIALIZER \ + { \ + .pblk = BLIS_PBLK_INITIALIZER, \ + /* When using C++, which is strongly typed, we avoid use of -1 as a + packbuf_t value since it will result in a compile-time error. */ \ + .buf_type = BLIS_BUFFER_FOR_GEN_USE, \ + .pool = NULL, \ + .size = 0, \ + } +#else // C99 #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ - } \ + } +#endif + BLIS_INLINE void bli_mem_clear( mem_t* mem ) { diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index cabaf4ff6..abcf708e2 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -37,13 +37,13 @@ #include "blis.h" // Statically initialize the mutex within the packing block allocator object. -static pba_t pba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; +static pba_t global_pba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; // ----------------------------------------------------------------------------- pba_t* bli_pba_query( void ) { - return &pba; + return &global_pba; } void bli_pba_init @@ -92,17 +92,12 @@ void bli_pba_finalize void bli_pba_acquire_m ( - rntm_t* rntm, + pba_t* pba, siz_t req_size, packbuf_t buf_type, mem_t* mem ) { - pool_t* pool; - pblk_t* pblk; - dim_t pi; - err_t r_val; - // If the internal memory pools for packing block allocator are disabled, // we spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the // immediate usage of bli_pba_malloc(). @@ -115,10 +110,6 @@ void bli_pba_acquire_m #endif #endif - // Query the memory broker from the runtime. 
- pba_t* pba = bli_rntm_pba( rntm ); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { malloc_ft malloc_fp = bli_pba_malloc_fp( pba ); @@ -126,6 +117,7 @@ void bli_pba_acquire_m // For general-use buffer requests, dynamically allocating memory // is assumed to be sufficient. + err_t r_val; void* buf = bli_fmalloc_align( malloc_fp, req_size, align_size, &r_val ); // Initialize the mem_t object with: @@ -148,11 +140,11 @@ void bli_pba_acquire_m // Map the requested packed buffer type to a zero-based index, which // we then use to select the corresponding memory pool. - pi = bli_packbuf_index( buf_type ); - pool = bli_pba_pool( pi, pba ); + dim_t pi = bli_packbuf_index( buf_type ); + pool_t* pool = bli_pba_pool( pi, pba ); // Extract the address of the pblk_t struct within the mem_t. - pblk = bli_mem_pblk( mem ); + pblk_t* pblk = bli_mem_pblk( mem ); // Acquire the mutex associated with the pba object. bli_pba_lock( pba ); @@ -197,13 +189,10 @@ void bli_pba_acquire_m void bli_pba_release ( - rntm_t* rntm, - mem_t* mem + pba_t* pba, + mem_t* mem ) { - // Query the memory broker from the runtime. - pba_t* pba = bli_rntm_pba( rntm ); - // Extract the buffer type so we know what kind of memory was allocated. 
packbuf_t buf_type = bli_mem_buf_type( mem ); diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index dfda53090..0adde1941 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -132,7 +132,7 @@ void bli_pba_finalize void bli_pba_acquire_m ( - rntm_t* rntm, + pba_t* pba, siz_t req_size, packbuf_t buf_type, mem_t* mem @@ -140,20 +140,10 @@ void bli_pba_acquire_m void bli_pba_release ( - rntm_t* rntm, - mem_t* mem + pba_t* pba, + mem_t* mem ); -BLIS_INLINE void bli_pba_rntm_set_pba - ( - rntm_t* rntm - ) -{ - pba_t* pba = bli_pba_query(); - - bli_rntm_set_pba( pba, rntm ); -} - siz_t bli_pba_pool_size ( const pba_t* pba, diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index f6756c589..882ad1cc3 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -52,10 +52,6 @@ typedef struct rntm_s bool pack_a; bool pack_b; bool l3_sup; - - pool_t* sba_pool; - pba_t* pba; - } rntm_t; */ @@ -80,7 +76,7 @@ BLIS_INLINE dim_t bli_rntm_num_threads( const rntm_t* rntm ) BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm ) { - return rntm->thrloop[ bszid ]; + return ( bszid == BLIS_NO_PART ? 
1 : rntm->thrloop[ bszid ] ); } BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm ) @@ -122,20 +118,6 @@ BLIS_INLINE bool bli_rntm_l3_sup( const rntm_t* rntm ) return rntm->l3_sup; } -// -// -- rntm_t query (internal use only) ----------------------------------------- -// - -BLIS_INLINE pool_t* bli_rntm_sba_pool( const rntm_t* rntm ) -{ - return rntm->sba_pool; -} - -BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm ) -{ - return rntm->pba; -} - // // -- rntm_t modification (internal use only) ---------------------------------- // @@ -196,16 +178,6 @@ BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, bli_rntm_set_pr_ways_only( 1, rntm ); } -BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) -{ - rntm->sba_pool = sba_pool; -} - -BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) -{ - rntm->pba = pba; -} - BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( 1, rntm ); @@ -276,15 +248,6 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) bli_rntm_set_l3_sup( TRUE, rntm ); } -BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) -{ - bli_rntm_set_sba_pool( NULL, rntm ); -} -BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) -{ - bli_rntm_set_pba( NULL, rntm ); -} - // // -- rntm_t initialization ---------------------------------------------------- // @@ -302,8 +265,6 @@ BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ - .sba_pool = NULL, \ - .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) @@ -317,9 +278,6 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); - - bli_rntm_clear_sba_pool( rntm ); - bli_rntm_clear_pba( rntm ); } // diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 776622bb4..5123c5b4b 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -57,7 
+57,7 @@ void bli_sba_finalize( void ) void* bli_sba_acquire ( - rntm_t* rntm, + pool_t* pool, siz_t req_size ) { @@ -65,7 +65,16 @@ void* bli_sba_acquire err_t r_val; #ifdef BLIS_ENABLE_SBA_POOLS - if ( rntm == NULL ) + + // We don't expect NULL sba_pool pointers in the normal course of BLIS + // operation. However, there are rare instances where it is convenient + // to support use of bli_sba_acquire() without having to pass in a valid + // sba pool data structure. The case that inspired this branch was the + // gemm_ukr and related test modules in the BLIS testsuite. (There, it + // is convenient to not have to checkout an array_t from the sba, and it + // does no harm since the malloc() happens outside of the region that + // would be timed.) + if ( pool == NULL ) { block = bli_malloc_intl( req_size, &r_val ); } @@ -73,43 +82,26 @@ void* bli_sba_acquire { pblk_t pblk; - // Query the small block pool from the rntm. - pool_t* pool = bli_rntm_sba_pool( rntm ); - - // We don't expect NULL sba_pool pointers in the normal course of BLIS - // operation. However, there are rare instances where it is convenient - // to support use of bli_sba_acquire() without having to pass in a valid - // sba pool data structure. The case that inspired this branch was the - // gemm_ukr and related test modules in the BLIS testsuite. (There, it - // is convenient to not have to checkout an array_t from the sba, and it - // does no harm since the malloc() happens outside of the region that - // would be timed.) - if ( pool == NULL ) - { - block = bli_malloc_intl( req_size, &r_val ); - } - else + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. + if ( block_size < req_size ) { - // Query the block_size of the pool_t so that we can request the exact - // size present. 
- const siz_t block_size = bli_pool_block_size( pool ); - - // Sanity check: Make sure the requested size is no larger than the - // block_size field of the pool. - if ( block_size < req_size ) - { - printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", - ( int )block_size, ( int )req_size ); - bli_abort(); - } - - // Check out a block using the block_size queried above. - bli_pool_checkout_block( block_size, &pblk, pool ); - - // The block address is stored within the pblk_t. - block = bli_pblk_buf( &pblk ); + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. + block = bli_pblk_buf( &pblk ); } + #else block = bli_malloc_intl( req_size, &r_val ); @@ -122,12 +114,13 @@ void* bli_sba_acquire void bli_sba_release ( - rntm_t* rntm, + pool_t* pool, void* block ) { #ifdef BLIS_ENABLE_SBA_POOLS - if ( rntm == NULL ) + + if ( pool == NULL ) { bli_free_intl( block ); } @@ -135,32 +128,23 @@ void bli_sba_release { pblk_t pblk; - // Query the small block pool from the rntm. - pool_t* pool = bli_rntm_sba_pool( rntm ); - - if ( pool == NULL ) - { - bli_free_intl( block ); - } - else - { - // Query the block_size field from the pool. This is not super-important - // for this particular application of the pool_t (that is, the "leaf" - // component of the sba), but it seems like good housekeeping to maintain - // the block_size field of the pblk_t in case its ever needed/read. - const siz_t block_size = bli_pool_block_size( pool ); - - // Embed the block's memory address into a pblk_t, along with the - // block_size queried from the pool. - bli_pblk_set_buf( block, &pblk ); - bli_pblk_set_block_size( block_size, &pblk ); - - // Check the pblk_t back into the pool_t. 
(It's okay that the pblk_t is - // a local variable since its contents are copied into the pool's internal - // data structure--an array of pblk_t.) - bli_pool_checkin_block( &pblk, pool ); - } + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case its ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. + bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) + bli_pool_checkin_block( &pblk, pool ); } + #else bli_free_intl( block ); @@ -173,11 +157,11 @@ array_t* bli_sba_checkout_array const siz_t n_threads ) { - #ifndef BLIS_ENABLE_SBA_POOLS - return NULL; - #endif - +#ifdef BLIS_ENABLE_SBA_POOLS return bli_apool_checkout_array( n_threads, &sba ); +#else + return NULL; +#endif } void bli_sba_checkin_array @@ -185,30 +169,10 @@ void bli_sba_checkin_array array_t* array ) { - #ifndef BLIS_ENABLE_SBA_POOLS - return; - #endif - +#ifdef BLIS_ENABLE_SBA_POOLS bli_apool_checkin_array( array, &sba ); -} - -void bli_sba_rntm_set_pool - ( - siz_t index, - array_t* array, - rntm_t* rntm - ) -{ - #ifndef BLIS_ENABLE_SBA_POOLS - bli_rntm_set_sba_pool( NULL, rntm ); +#else return; - #endif - - // Query the pool_t* in the array_t corresponding to index. - pool_t* pool = bli_apool_array_elem( index, array ); - - // Embed the pool_t* into the rntm_t. 
- bli_rntm_set_sba_pool( pool, rntm ); +#endif } - diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h index 4fc3aaaee..8d9db844f 100644 --- a/frame/base/bli_sba.h +++ b/frame/base/bli_sba.h @@ -52,24 +52,17 @@ void bli_sba_checkin_array array_t* array ); -void bli_sba_rntm_set_pool - ( - siz_t index, - array_t* array, - rntm_t* rntm - ); - void* bli_sba_acquire ( - rntm_t* rntm, + pool_t* pool, siz_t req_size ); + void bli_sba_release ( - rntm_t* rntm, + pool_t* pool, void* block ); - #endif diff --git a/frame/compat/extra/bla_gemm3m.c b/frame/compat/extra/bla_gemm3m.c index 258ac5bbb..1d124cbc2 100644 --- a/frame/compat/extra/bla_gemm3m.c +++ b/frame/compat/extra/bla_gemm3m.c @@ -242,8 +242,7 @@ void PASTEF77(ch,blasname) \ &betao, \ &co, \ cntx, \ - rntm, \ - NULL \ + rntm \ ); \ } \ \ diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 42ad9c72b..71a6096e1 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -44,7 +44,5 @@ BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index 7252fd7ff..b150b89fc 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm +#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, const rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index f12be24b8..e7665e779 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm +#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, const rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index d37e62f8a..0c5d11e6b 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -69,7 +69,7 @@ // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). -#if BLIS_BLAS_INT_SIZE == 64 +#if BLIS_BLAS_INT_TYPE_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif @@ -1072,14 +1072,7 @@ struct cntl_s void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; - - // Optional fields (needed only by some operations such as packm). - // NOTE: first field of params must be a uint64_t containing the size - // of the struct. void* params; - - // Internal fields that track "cached" data. 
- mem_t pack_mem; }; typedef struct cntl_s cntl_t; @@ -1184,9 +1177,8 @@ typedef void (*obj_pack_fn_t) const struct obj_s* a, struct obj_s* ap, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - const struct thrinfo_s* thread + const struct cntl_s* cntl, + struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) @@ -1195,9 +1187,8 @@ typedef void (*obj_ker_fn_t) const struct obj_s* b, const struct obj_s* c, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - const struct thrinfo_s* thread + const struct cntl_s* cntl, + struct thrinfo_s* thread ); typedef struct obj_s @@ -1457,15 +1448,6 @@ typedef struct rntm_s bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. - - // "Internal" fields: these should not be exposed to the end-user. - - // The small block pool, which is attached in the l3 thread decorator. - pool_t* sba_pool; - - // The packing block allocator, which is attached in the l3 thread decorator. - pba_t* pba; - } rntm_t; diff --git a/frame/include/level0/1e/bli_copy1es.h b/frame/include/level0/1e/bli_copy1es.h index 0d5c98175..7dc6a493a 100644 --- a/frame/include/level0/1e/bli_copy1es.h +++ b/frame/include/level0/1e/bli_copy1es.h @@ -41,18 +41,18 @@ // - The first char encodes the type of x. // - The second char encodes the type of y. 
-#define bli_sscopy1es( a, bri, bir ) {} -#define bli_dscopy1es( a, bri, bir ) {} -#define bli_cscopy1es( a, bri, bir ) {} -#define bli_zscopy1es( a, bri, bir ) {} - -#define bli_sdcopy1es( a, bri, bir ) {} -#define bli_ddcopy1es( a, bri, bir ) {} -#define bli_cdcopy1es( a, bri, bir ) {} -#define bli_zdcopy1es( a, bri, bir ) {} - -#define bli_sccopy1es( a, bri, bir ) {} -#define bli_dccopy1es( a, bri, bir ) {} +#define bli_sscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_dscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_cscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_zscopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } + +#define bli_sdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_ddcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_cdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_zdcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } + +#define bli_sccopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_dccopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } #define bli_cccopy1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ @@ -64,8 +64,8 @@ bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } -#define bli_szcopy1es( a, bri, bir ) {} -#define bli_dzcopy1es( a, bri, bir ) {} +#define bli_szcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_dzcopy1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } #define bli_czcopy1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ diff --git a/frame/include/level0/1e/bli_copyj1es.h b/frame/include/level0/1e/bli_copyj1es.h index f2139a883..25bb19d5b 100644 --- a/frame/include/level0/1e/bli_copyj1es.h +++ 
b/frame/include/level0/1e/bli_copyj1es.h @@ -41,18 +41,18 @@ // - The first char encodes the type of x. // - The second char encodes the type of y. -#define bli_sscopyj1es( a, bri, bir ) {} -#define bli_dscopyj1es( a, bri, bir ) {} -#define bli_cscopyj1es( a, bri, bir ) {} -#define bli_zscopyj1es( a, bri, bir ) {} - -#define bli_sdcopyj1es( a, bri, bir ) {} -#define bli_ddcopyj1es( a, bri, bir ) {} -#define bli_cdcopyj1es( a, bri, bir ) {} -#define bli_zdcopyj1es( a, bri, bir ) {} - -#define bli_sccopyj1es( a, bri, bir ) {} -#define bli_dccopyj1es( a, bri, bir ) {} +#define bli_sscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_dscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_cscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_zscopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } + +#define bli_sdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_ddcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_cdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_zdcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } + +#define bli_sccopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_dccopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } #define bli_cccopyj1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ @@ -64,8 +64,8 @@ bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } -#define bli_szcopyj1es( a, bri, bir ) {} -#define bli_dzcopyj1es( a, bri, bir ) {} +#define bli_szcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } +#define bli_dzcopyj1es( a, bri, bir ) { ( void )a; ( void )bri; ( void )bir; } #define bli_czcopyj1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ diff --git 
a/frame/include/level0/1e/bli_scal21es.h b/frame/include/level0/1e/bli_scal21es.h index f54f2fd01..cee7745e8 100644 --- a/frame/include/level0/1e/bli_scal21es.h +++ b/frame/include/level0/1e/bli_scal21es.h @@ -44,52 +44,52 @@ // -- (axy) = (??s) ------------------------------------------------------------ -#define bli_sssscal21es( a, x, yri, yir ) {} -#define bli_sdsscal21es( a, x, yri, yir ) {} -#define bli_scsscal21es( a, x, yri, yir ) {} -#define bli_szsscal21es( a, x, yri, yir ) {} - -#define bli_dssscal21es( a, x, yri, yir ) {} -#define bli_ddsscal21es( a, x, yri, yir ) {} -#define bli_dcsscal21es( a, x, yri, yir ) {} -#define bli_dzsscal21es( a, x, yri, yir ) {} - -#define bli_cssscal21es( a, x, yri, yir ) {} -#define bli_cdsscal21es( a, x, yri, yir ) {} -#define bli_ccsscal21es( a, x, yri, yir ) {} -#define bli_czsscal21es( a, x, yri, yir ) {} - -#define bli_zssscal21es( a, x, yri, yir ) {} -#define bli_zdsscal21es( a, x, yri, yir ) {} -#define bli_zcsscal21es( a, x, yri, yir ) {} -#define bli_zzsscal21es( a, x, yri, yir ) {} +#define bli_sssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_scsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_szsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } + +#define bli_dssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ddsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dcsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dzsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } + +#define bli_cssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_cdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void 
)yir; } +#define bli_ccsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_czsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } + +#define bli_zssscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zdsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zcsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zzsscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } // -- (axy) = (??d) ------------------------------------------------------------ -#define bli_ssdscal21es( a, x, yri, yir ) {} -#define bli_sddscal21es( a, x, yri, yir ) {} -#define bli_scdscal21es( a, x, yri, yir ) {} -#define bli_szdscal21es( a, x, yri, yir ) {} +#define bli_ssdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_scdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_szdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } -#define bli_dsdscal21es( a, x, yri, yir ) {} -#define bli_dddscal21es( a, x, yri, yir ) {} -#define bli_dcdscal21es( a, x, yri, yir ) {} -#define bli_dzdscal21es( a, x, yri, yir ) {} +#define bli_dsdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dcdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dzdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } -#define bli_csdscal21es( a, x, yri, yir ) {} -#define bli_cddscal21es( a, x, yri, yir ) {} -#define bli_ccdscal21es( a, x, yri, yir ) {} -#define bli_czdscal21es( a, x, yri, yir ) {} +#define bli_csdscal21es( 
a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_cddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ccdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_czdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } -#define bli_zsdscal21es( a, x, yri, yir ) {} -#define bli_zddscal21es( a, x, yri, yir ) {} -#define bli_zcdscal21es( a, x, yri, yir ) {} -#define bli_zzdscal21es( a, x, yri, yir ) {} +#define bli_zsdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zddscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zcdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zzdscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } // -- (axy) = (??c) ------------------------------------------------------------ -#define bli_sscscal21es( a, x, yri, yir ) {} -#define bli_sdcscal21es( a, x, yri, yir ) {} +#define bli_sscscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sdcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_sccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ @@ -101,8 +101,8 @@ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } -#define bli_dscscal21es( a, x, yri, yir ) {} -#define bli_ddcscal21es( a, x, yri, yir ) {} +#define bli_dscscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ddcscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_dccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), 
bli_zimag(yri) ); \ @@ -158,8 +158,8 @@ // -- (axy) = (??z) ------------------------------------------------------------ -#define bli_sszscal21es( a, x, yri, yir ) {} -#define bli_sdzscal21es( a, x, yri, yir ) {} +#define bli_sszscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sdzscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_sczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ @@ -171,8 +171,8 @@ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } -#define bli_dszscal21es( a, x, yri, yir ) {} -#define bli_ddzscal21es( a, x, yri, yir ) {} +#define bli_dszscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ddzscal21es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_dczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ diff --git a/frame/include/level0/1e/bli_scal2j1es.h b/frame/include/level0/1e/bli_scal2j1es.h index 741fbceed..a32c4f2e4 100644 --- a/frame/include/level0/1e/bli_scal2j1es.h +++ b/frame/include/level0/1e/bli_scal2j1es.h @@ -44,52 +44,52 @@ // -- (axy) = (??s) ------------------------------------------------------------ -#define bli_sssscal2j1es( a, x, yri, yir ) {} -#define bli_sdsscal2j1es( a, x, yri, yir ) {} -#define bli_scsscal2j1es( a, x, yri, yir ) {} -#define bli_szsscal2j1es( a, x, yri, yir ) {} - -#define bli_dssscal2j1es( a, x, yri, yir ) {} -#define bli_ddsscal2j1es( a, x, yri, yir ) {} -#define bli_dcsscal2j1es( a, x, yri, yir ) {} -#define bli_dzsscal2j1es( a, x, yri, yir ) {} - -#define bli_cssscal2j1es( a, x, yri, yir ) {} -#define bli_cdsscal2j1es( a, x, yri, yir ) {} -#define bli_ccsscal2j1es( a, x, yri, yir ) {} -#define bli_czsscal2j1es( 
a, x, yri, yir ) {} - -#define bli_zssscal2j1es( a, x, yri, yir ) {} -#define bli_zdsscal2j1es( a, x, yri, yir ) {} -#define bli_zcsscal2j1es( a, x, yri, yir ) {} -#define bli_zzsscal2j1es( a, x, yri, yir ) {} +#define bli_sssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_scsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_szsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } + +#define bli_dssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ddsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dcsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dzsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } + +#define bli_cssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_cdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ccsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_czsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } + +#define bli_zssscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zdsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zcsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zzsscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } // -- (axy) = (??d) ------------------------------------------------------------ -#define bli_ssdscal2j1es( a, x, yri, yir ) {} -#define bli_sddscal2j1es( a, x, yri, yir ) {} -#define bli_scdscal2j1es( a, x, yri, yir ) {} -#define 
bli_szdscal2j1es( a, x, yri, yir ) {} +#define bli_ssdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_scdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_szdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } -#define bli_dsdscal2j1es( a, x, yri, yir ) {} -#define bli_dddscal2j1es( a, x, yri, yir ) {} -#define bli_dcdscal2j1es( a, x, yri, yir ) {} -#define bli_dzdscal2j1es( a, x, yri, yir ) {} +#define bli_dsdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dcdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_dzdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } -#define bli_csdscal2j1es( a, x, yri, yir ) {} -#define bli_cddscal2j1es( a, x, yri, yir ) {} -#define bli_ccdscal2j1es( a, x, yri, yir ) {} -#define bli_czdscal2j1es( a, x, yri, yir ) {} +#define bli_csdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_cddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ccdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_czdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } -#define bli_zsdscal2j1es( a, x, yri, yir ) {} -#define bli_zddscal2j1es( a, x, yri, yir ) {} -#define bli_zcdscal2j1es( a, x, yri, yir ) {} -#define bli_zzdscal2j1es( a, x, yri, yir ) {} +#define bli_zsdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zddscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zcdscal2j1es( a, x, yri, yir ) { ( void 
)a; ( void )x; ( void )yri; ( void )yir; } +#define bli_zzdscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } // -- (axy) = (??c) ------------------------------------------------------------ -#define bli_sscscal2j1es( a, x, yri, yir ) {} -#define bli_sdcscal2j1es( a, x, yri, yir ) {} +#define bli_sscscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sdcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ @@ -101,8 +101,8 @@ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } -#define bli_dscscal2j1es( a, x, yri, yir ) {} -#define bli_ddcscal2j1es( a, x, yri, yir ) {} +#define bli_dscscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ddcscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ @@ -158,8 +158,8 @@ // -- (axy) = (??z) ------------------------------------------------------------ -#define bli_sszscal2j1es( a, x, yri, yir ) {} -#define bli_sdzscal2j1es( a, x, yri, yir ) {} +#define bli_sszscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_sdzscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ @@ -171,8 +171,8 @@ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } -#define bli_dszscal2j1es( a, x, yri, yir ) {} -#define bli_ddzscal2j1es( a, x, yri, 
yir ) {} +#define bli_dszscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } +#define bli_ddzscal2j1es( a, x, yri, yir ) { ( void )a; ( void )x; ( void )yri; ( void )yir; } #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ diff --git a/frame/include/level0/ri/bli_copyris.h b/frame/include/level0/ri/bli_copyris.h index 8dd7b9b73..cd971587d 100644 --- a/frame/include/level0/ri/bli_copyris.h +++ b/frame/include/level0/ri/bli_copyris.h @@ -40,11 +40,13 @@ #define bli_scopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ + ( void )ai; ( void )bi; \ } #define bli_dcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ + ( void )ai; ( void )bi; \ } #define bli_ccopyris( ar, ai, br, bi ) \ @@ -59,23 +61,23 @@ (bi) = (ai); \ } -#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) -#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) +#define bli_sscopyris( ar, ai, br, bi ) { bli_scopyris( ar, 0.0F, br, bi ); ( void )ai; } +#define bli_dscopyris( ar, ai, br, bi ) { bli_scopyris( ar, 0.0, br, bi ); ( void )ai; } #define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) -#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) -#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) +#define bli_sdcopyris( ar, ai, br, bi ) { bli_dcopyris( ar, 0.0F, br, bi ); ( void )ai; } +#define bli_ddcopyris( ar, ai, br, bi ) { bli_dcopyris( ar, 0.0, br, bi ); ( void )ai; } #define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) -#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) -#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) +#define bli_sccopyris( ar, ai, br, bi ) { bli_ccopyris( ar, 
0.0F, br, bi ); ( void )ai; } +#define bli_dccopyris( ar, ai, br, bi ) { bli_ccopyris( ar, 0.0, br, bi ); ( void )ai; } #define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) -#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) -#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) +#define bli_szcopyris( ar, ai, br, bi ) { bli_zcopyris( ar, 0.0F, br, bi ); ( void )ai; } +#define bli_dzcopyris( ar, ai, br, bi ) { bli_zcopyris( ar, 0.0, br, bi ); ( void )ai; } #define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) #define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) diff --git a/frame/include/level0/ri/bli_scal2jris.h b/frame/include/level0/ri/bli_scal2jris.h index 9e99e583d..f3b71ed2e 100644 --- a/frame/include/level0/ri/bli_scal2jris.h +++ b/frame/include/level0/ri/bli_scal2jris.h @@ -40,6 +40,7 @@ #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ + ( void )ai; ( void )xi; ( void )yi; \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ @@ -51,18 +52,21 @@ #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ + ( void )yi; \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ + ( void )ai; \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ + ( void )xi; \ } // Notes: diff --git a/frame/include/level0/ri/bli_scal2ris.h b/frame/include/level0/ri/bli_scal2ris.h index 45e0ce427..e30fd9789 100644 --- a/frame/include/level0/ri/bli_scal2ris.h +++ b/frame/include/level0/ri/bli_scal2ris.h @@ -40,6 +40,7 @@ #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ + ( void )ai; ( void )xi; ( void )yi; \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ @@ -51,18 +52,21 @@ #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ 
(yr) = (ar) * (xr) - (ai) * (xi); \ + ( void )yi; \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ + ( void )ai; \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ + ( void )xi; \ } // Notes: diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c deleted file mode 100644 index 890c174cf..000000000 --- a/frame/thread/bli_l3_decor_openmp.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_OPENMP - -//#define PRINT_THRINFO -//#define PRINT_IMPL - -void bli_l3_thread_decorator_openmp - ( - l3int_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - // Query the total number of threads from the rntm_t object. - const dim_t n_threads = bli_rntm_num_threads( rntm ); - -#ifdef PRINT_IMPL - const timpl_t ti = bli_rntm_thread_impl( rntm ); - printf( "l3_decor_openmp: l3 decor with rntm.thread_impl = %s\n", - ( ti == BLIS_SINGLE ? "single" : - ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); -#endif - - #ifdef PRINT_THRINFO - err_t r_val; - thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ), &r_val ); - #endif - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. 
- bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - - // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - - - _Pragma( "omp parallel num_threads(n_threads)" ) - { - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* rntm_p = &rntm_l; - - // Query the thread's id from OpenMP. - const dim_t tid = omp_get_thread_num(); - - // Check for a somewhat obscure OpenMP thread-mistmatch issue. - bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; - - // Alias thread-local copies of A, B, and C. These will be the objects - // we pass down the algorithmic function stack. Making thread-local - // aliases is highly recommended in case a thread needs to change any - // of the properties of an object without affecting other threads' - // objects. - bli_obj_alias_to( a, &a_t ); - bli_obj_alias_to( b, &b_t ); - bli_obj_alias_to( c, &c_t ); - - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. 
Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - pack_t schema_a = bli_obj_pack_schema( &a_t ); - pack_t schema_b = bli_obj_pack_schema( &b_t ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); - - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); - - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); - -#if 1 - func - ( - alpha, - &a_t, - &b_t, - beta, - &c_t, - cntx, - rntm_p, - cntl_use, - thread - ); -#else - bli_thrinfo_grow_tree - ( - rntm_p, - cntl_use, - thread - ); -#endif - - // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); - - #ifdef PRINT_THRINFO - threads[tid] = thread; - #else - // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); - #endif - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - - #ifdef PRINT_THRINFO - if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads ); - else bli_l3_thrinfo_print_trsm_paths( threads ); - exit(1); - #endif - - // Check the array_t back into the small block allocator. Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. 
- bli_sba_checkin_array( array ); -} - -// ----------------------------------------------------------------------------- - -void bli_l3_thread_decorator_thread_check - ( - dim_t n_threads, - dim_t tid, - thrcomm_t* gl_comm, - rntm_t* rntm - ) -{ - dim_t n_threads_real = omp_get_num_threads(); - - // Check if the number of OpenMP threads created within this parallel - // region is different from the number of threads that were requested - // of BLIS. This inequality may trigger when, for example, the - // following conditions are satisfied: - // - an application is executing an OpenMP parallel region in which - // BLIS is invoked, - // - BLIS is configured for multithreading via OpenMP, - // - OMP_NUM_THREADS = t > 1, - // - the number of threads requested of BLIS (regardless of method) - // is p <= t, - // - OpenMP nesting is disabled. - // In this situation, the application spawns t threads. Each application - // thread calls gemm (for example). Each gemm will attempt to spawn p - // threads via OpenMP. However, since nesting is disabled, the OpenMP - // implementation finds that t >= p threads are already spawned, and - // thus it doesn't spawn *any* additional threads for each gemm. - if ( n_threads_real != n_threads ) - { - // If the number of threads active in the current region is not - // equal to the number requested of BLIS, we then only continue - // if the number of threads in the current region is 1. If, for - // example, BLIS requested 4 threads but only got 3, then we - // abort(). - //if ( tid == 0 ) - //{ - if ( n_threads_real != 1 ) - { - bli_print_msg( "A different number of threads was " - "created than was requested.", - __FILE__, __LINE__ ); - bli_abort(); - } - - const timpl_t ti = bli_rntm_thread_impl( rntm ); - - //n_threads = 1; // not needed since it has no effect? 
- bli_thrcomm_init( ti, 1, gl_comm ); - bli_rntm_set_num_threads_only( 1, rntm ); - bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); - //} - - // Synchronize all threads and continue. - _Pragma( "omp barrier" ) - } -} - -#endif - diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c deleted file mode 100644 index d31414d3b..000000000 --- a/frame/thread/bli_l3_decor_pthreads.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_PTHREADS - -// A data structure to assist in passing operands to additional threads. -typedef struct thread_data -{ - l3int_ft func; - opid_t family; - const obj_t* alpha; - const obj_t* a; - const obj_t* b; - const obj_t* beta; - const obj_t* c; - const cntx_t* cntx; - rntm_t* rntm; - cntl_t* cntl; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; -} thread_data_t; - -// Entry point for additional threads -void* bli_l3_thread_entry( void* data_void ) -{ - const thread_data_t* data = data_void; - - const l3int_ft func = data->func; - const opid_t family = data->family; - const obj_t* alpha = data->alpha; - const obj_t* a = data->a; - const obj_t* b = data->b; - const obj_t* beta = data->beta; - const obj_t* c = data->c; - const cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - cntl_t* cntl = data->cntl; - const dim_t tid = data->tid; - array_t* array = data->array; - thrcomm_t* gl_comm = data->gl_comm; - - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* rntm_p = &rntm_l; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. 
- // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; - - // Alias thread-local copies of A, B, and C. These will be the objects - // we pass down the algorithmic function stack. Making thread-local - // aliases is highly recommended in case a thread needs to change any - // of the properties of an object without affecting other threads' - // objects. - bli_obj_alias_to( a, &a_t ); - bli_obj_alias_to( b, &b_t ); - bli_obj_alias_to( c, &c_t ); - - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - pack_t schema_a = bli_obj_pack_schema( &a_t ); - pack_t schema_b = bli_obj_pack_schema( &b_t ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); - - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); - - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); - - func - ( - alpha, - &a_t, - &b_t, - beta, - &c_t, - cntx, - rntm_p, - cntl_use, - thread - ); - - // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); - - // Free the current thread's thrinfo_t structure. 
- bli_l3_thrinfo_free( rntm_p, thread ); - - return NULL; -} - -//#define PRINT_IMPL - -void bli_l3_thread_decorator_pthreads - ( - l3int_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - err_t r_val; - - // Query the total number of threads from the rntm_t object. - const dim_t n_threads = bli_rntm_num_threads( rntm ); - -#ifdef PRINT_IMPL - const timpl_t ti = bli_rntm_thread_impl( rntm ); - printf( "l3_decor_pthrea: l3 decor with rntm.thread_impl = %s\n", - ( ti == BLIS_SINGLE ? "single" : - ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); -#endif - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - - // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - - // Allocate an array of pthread objects and auxiliary data structs to pass - // to the thread entry functions. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); - - // NOTE: We must iterate backwards so that the chief thread (thread id 0) - // can spawn all other threads before proceeding with its own computation. - for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) - { - // Set up thread data for additional threads (beyond thread 0). - datas[tid].func = func; - datas[tid].family = family; - datas[tid].alpha = alpha; - datas[tid].a = a; - datas[tid].b = b; - datas[tid].beta = beta; - datas[tid].c = c; - datas[tid].cntx = cntx; - datas[tid].rntm = rntm; - datas[tid].cntl = cntl; - datas[tid].tid = tid; - datas[tid].gl_comm = gl_comm; - datas[tid].array = array; - - // Spawn additional threads for ids greater than 1. - if ( tid != 0 ) - bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] ); - else - bli_l3_thread_entry( ( void* )(&datas[0]) ); - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - - // Thread 0 waits for additional threads to finish. - for ( dim_t tid = 1; tid < n_threads; tid++ ) - { - bli_pthread_join( pthreads[tid], NULL ); - } - - // Check the array_t back into the small block allocator. Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. 
- bli_sba_checkin_array( array ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_free_intl( pthreads ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_free_intl( datas ); -} - -#else - -// Define a dummy function bli_l3_thread_entry(), which is needed for -// consistent dynamic linking behavior when building shared objects in Linux -// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. -void* bli_l3_thread_entry( void* data_void ) { return NULL; } - -#endif - diff --git a/frame/thread/bli_l3_decor_pthreads.h b/frame/thread/bli_l3_decor_pthreads.h deleted file mode 100644 index edf36cf6e..000000000 --- a/frame/thread/bli_l3_decor_pthreads.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_L3_DECOR_PTHREADS_H -#define BLIS_L3_DECOR_PTHREADS_H - -// Definitions specific to situations when POSIX multithreading is enabled. -#ifdef BLIS_ENABLE_PTHREADS - -// Thread entry point prototype. -void* bli_l3_thread_entry( void* data_void ); - -void bli_l3_thread_decorator_pthreads - ( - l3int_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); - -#endif - -#endif - diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c deleted file mode 100644 index 6f0f8603b..000000000 --- a/frame/thread/bli_l3_decor_single.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -//#define PRINT_IMPL - -void bli_l3_thread_decorator_single - ( - l3int_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ) -{ - // For sequential execution, we use only one thread. - const dim_t n_threads = 1; - -#ifdef PRINT_IMPL - const timpl_t ti = bli_rntm_thread_impl( rntm ); - printf( "l3_decor_single: l3 decor with rntm.thread_impl = %s\n", - ( ti == BLIS_SINGLE ? "single" : - ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); -#endif - - obj_t a_t, b_t; - bli_obj_alias_to( a, &a_t ); - bli_obj_alias_to( b, &b_t ); - - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. 
Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - pack_t schema_a = bli_obj_pack_schema( &a_t ); - pack_t schema_b = bli_obj_pack_schema( &b_t ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we can create the global comm below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. - bli_pba_rntm_set_pba( rntm ); - - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#if 0 - timpl_t ti2 = bli_rntm_thread_impl( rntm ); - printf( "l3_decor_single: created thrcomm_t.ti = %s\n", - ( ti2 == BLIS_SINGLE ? "single" : - ( ti2 == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); -#endif - - - { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - - cntl_t* cntl_use; - thrinfo_t* thread; - - const dim_t tid = 0; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. 
- //bli_sba_rntm_set_pool( tid, array, rntm_p ); - - // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't - // need to alias objects for A, B, and C since they were already aliased - // in bli_*_front(). However, we may add aliasing here in the future so - // that, with all three (_single.c, _openmp.c, _pthreads.c) implementations - // consistently providing local aliases, we can then eliminate aliasing - // elsewhere. - - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, c, rntm_p, cntl, &cntl_use ); - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); - - func - ( - alpha, - &a_t, - &b_t, - beta, - c, - cntx, - rntm_p, - cntl_use, - thread - ); - - // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); - - // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - - // Check the array_t back into the small block allocator. Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. - bli_sba_checkin_array( array ); -} - diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c deleted file mode 100644 index 7d06ad622..000000000 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_OPENMP - -//#define PRINT_THRINFO - -err_t bli_l3_sup_thread_decorator_openmp - ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm - ) -{ - // Query the total number of threads from the rntm_t object. - const dim_t n_threads = bli_rntm_num_threads( rntm ); - - // NOTE: The sba was initialized in bli_init(). 
- - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - - - _Pragma( "omp parallel num_threads(n_threads)" ) - { - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* rntm_p = &rntm_l; - - // Query the thread's id from OpenMP. - const dim_t tid = omp_get_thread_num(); - - // Check for a somewhat obscure OpenMP thread-mistmatch issue. - // NOTE: This calls the same function used for the conventional/large - // code path. - bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. 
- bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); - - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm_p, - thread - ); - - // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - - // Check the array_t back into the small block allocator. Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. - bli_sba_checkin_array( array ); - - return BLIS_SUCCESS; -} - -#endif - diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c deleted file mode 100644 index 7be5cf8fb..000000000 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ /dev/null @@ -1,225 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_PTHREADS - -// A data structure to assist in passing operands to additional threads. -typedef struct thread_data -{ - l3supint_ft func; - opid_t family; - const obj_t* alpha; - const obj_t* a; - const obj_t* b; - const obj_t* beta; - const obj_t* c; - const cntx_t* cntx; - rntm_t* rntm; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; -} thread_data_t; - -// Entry point for additional threads -void* bli_l3_sup_thread_entry( void* data_void ) -{ - thread_data_t* data = data_void; - - l3supint_ft func = data->func; - opid_t family = data->family; - const obj_t* alpha = data->alpha; - const obj_t* a = data->a; - const obj_t* b = data->b; - const obj_t* beta = data->beta; - const obj_t* c = data->c; - const cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - dim_t tid = data->tid; - array_t* array = data->array; - thrcomm_t* gl_comm = data->gl_comm; - - ( void )family; - - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. 
- rntm_t rntm_l = *rntm; - rntm_t* rntm_p = &rntm_l; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); - - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm_p, - thread - ); - - // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); - - return NULL; -} - -err_t bli_l3_sup_thread_decorator_pthreads - ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm - ) -{ - err_t r_val; - - // Query the total number of threads from the context. - const dim_t n_threads = bli_rntm_num_threads( rntm ); - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - - // Allocate a global communicator for the root thrinfo_t structures. 
- thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - - // Allocate an array of pthread objects and auxiliary data structs to pass - // to the thread entry functions. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); - - // NOTE: We must iterate backwards so that the chief thread (thread id 0) - // can spawn all other threads before proceeding with its own computation. - for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) - { - // Set up thread data for additional threads (beyond thread 0). - datas[tid].func = func; - datas[tid].family = family; - datas[tid].alpha = alpha; - datas[tid].a = a; - datas[tid].b = b; - datas[tid].beta = beta; - datas[tid].c = c; - datas[tid].cntx = cntx; - datas[tid].rntm = rntm; - datas[tid].tid = tid; - datas[tid].gl_comm = gl_comm; - datas[tid].array = array; - - // Spawn additional threads for ids greater than 1. - if ( tid != 0 ) - bli_pthread_create( &pthreads[tid], NULL, &bli_l3_sup_thread_entry, &datas[tid] ); - else - bli_l3_sup_thread_entry( ( void* )(&datas[0]) ); - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - - // Thread 0 waits for additional threads to finish. - for ( dim_t tid = 1; tid < n_threads; tid++ ) - { - bli_pthread_join( pthreads[tid], NULL ); - } - - // Check the array_t back into the small block allocator. Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. 
- bli_sba_checkin_array( array ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_free_intl( pthreads ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_free_intl( datas ); - - return BLIS_SUCCESS; -} - -#else - -// Define a dummy function bli_l3_thread_entry(), which is needed for -// consistent dynamic linking behavior when building shared objects in Linux -// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. -void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } - -#endif - diff --git a/frame/thread/bli_l3_sup_decor_pthreads.h b/frame/thread/bli_l3_sup_decor_pthreads.h deleted file mode 100644 index 310ea4e8b..000000000 --- a/frame/thread/bli_l3_sup_decor_pthreads.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H -#define BLIS_L3_SUP_DECOR_PTHREADS_H - -// Definitions specific to situations when POSIX multithreading is enabled. -#ifdef BLIS_ENABLE_PTHREADS - -// Thread entry point prototype. -void* bli_l3_sup_thread_entry( void* data_void ); - -err_t bli_l3_sup_thread_decorator_pthreads - ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm - ); - -#endif - -#endif - diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c deleted file mode 100644 index a419154e7..000000000 --- a/frame/thread/bli_l3_sup_decor_single.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define SKIP_THRINFO_TREE - -err_t bli_l3_sup_thread_decorator_single - ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm - ) -{ - // For sequential execution, we use only one thread. - const dim_t n_threads = 1; - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. 
- bli_pba_rntm_set_pba( rntm ); - -#ifndef SKIP_THRINFO_TREE - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif - - - { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - - // There is only one thread id (for the thief thread). - const dim_t tid = 0; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); - -#ifndef SKIP_THRINFO_TREE - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); -#else - // This optimization allows us to use one of the global thrinfo_t - // objects for single-threaded execution rather than grow one from - // scratch. The key is that bli_thrinfo_sup_grow(), which is called - // from within the variants, will immediately return if it detects - // that the thrinfo_t* passed into it is either - // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. - thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; - - ( void )tid; -#endif - - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm_p, - thread - ); - -#ifndef SKIP_THRINFO_TREE - // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); -#endif - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - - // Check the array_t back into the small block allocator. 
Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. - bli_sba_checkin_array( array ); - - return BLIS_SUCCESS; -} - diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index 6cd4325df..0547d296e 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -37,34 +37,30 @@ // -- Method-agnostic functions ------------------------------------------------ -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( timpl_t ti, pool_t* sba_pool, dim_t n_threads ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); - - const timpl_t ti = bli_rntm_thread_impl( rntm ); + thrcomm_t* comm = bli_sba_acquire( sba_pool, sizeof(thrcomm_t) ); bli_thrcomm_init( ti, n_threads, comm ); return comm; } -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ) { if ( comm == NULL ) return; - const timpl_t ti = bli_rntm_thread_impl( rntm ); - - bli_thrcomm_cleanup( ti, comm ); + bli_thrcomm_cleanup( comm ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_free(): " ); #endif - bli_sba_release( rntm, comm ); + bli_sba_release( sba_pool, comm ); } // -- Method-specific functions ------------------------------------------------ @@ -140,69 +136,46 @@ void bli_thrcomm_init( timpl_t ti, dim_t nt, thrcomm_t* comm ) { const thrcomm_init_ft fp = init_fpa[ ti ]; + // Sanity check: the function pointer queried from the function pointer + // array should never be NULL. if ( fp == NULL ) bli_abort(); // Call the threading-specific init function. fp( nt, comm ); // Embed the type of threading implementation within the thrcomm_t struct. - // This can be used later to make sure the application doesn't use a - // thrcomm_t initialized with threading type A with the API for threading - // type B. 
Note that we wait until after the init function has returned - // in case that function zeros out the entire struct before setting the - // fields. + // Note that we wait until after the init function has returned in case + // that function zeros out the entire struct before setting the fields. comm->ti = ti; } -void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - const thrcomm_cleanup_ft fp = cleanup_fpa[ ti ]; - - if ( fp == NULL ) bli_abort(); - // If comm is BLIS_SINGLE_COMM, we return early since there is no cleanup, // especially if it is being used with a threading implementation that // would normally want to free its thrcomm_t resources. if ( comm == &BLIS_SINGLE_COMM ) return; - // Sanity check. Make sure the threading implementation we were asked to use - // is the same as the implementation that initialized the thrcomm_t object. - if ( ti != comm->ti ) - { - printf( "bli_thrcomm_cleanup(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n", - ( comm->ti == BLIS_SINGLE ? "single" : - ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ), - ( ti == BLIS_SINGLE ? "single" : - ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); - bli_abort(); - } + const timpl_t ti = bli_thrcomm_thread_impl( comm ); + const thrcomm_cleanup_ft fp = cleanup_fpa[ ti ]; + + // Sanity check: the function pointer queried from the function pointer + // array should never be NULL. + if ( fp == NULL ) bli_abort(); // Call the threading-specific cleanup function. fp( comm ); } -void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm ) +void bli_thrcomm_barrier( dim_t tid, thrcomm_t* comm ) { + const timpl_t ti = bli_thrcomm_thread_impl( comm ); const thrcomm_barrier_ft fp = barrier_fpa[ ti ]; + // Sanity check: the function pointer queried from the function pointer + // array should never be NULL. if ( fp == NULL ) bli_abort(); - // Sanity check. 
Make sure the threading implementation we were asked to use - // is the same as the implementation that initialized the thrcomm_t object. - // We skip this check if comm is BLIS_SINGLE_COMM since the timpl_t value - // embedded in comm will often be different than that of BLIS_SINGLE_COMM - // (but we don't return early since we still need to barrier... wait, or do - // we?). - if ( ti != comm->ti && comm != &BLIS_SINGLE_COMM ) - { - printf( "bli_thrcomm_barrier(): thrcomm_t.ti = %s, but request via rntm_t.ti = %s\n", - ( comm->ti == BLIS_SINGLE ? "single" : - ( comm->ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ), - ( ti == BLIS_SINGLE ? "single" : - ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ) ); - bli_abort(); - } - // Call the threading-specific barrier function. fp( tid, comm ); } @@ -211,19 +184,18 @@ void bli_thrcomm_barrier( timpl_t ti, dim_t tid, thrcomm_t* comm ) void* bli_thrcomm_bcast ( - timpl_t ti, dim_t id, void* to_send, thrcomm_t* comm ) -{ +{ if ( comm == NULL || comm->n_threads == 1 ) return to_send; if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( ti, id, comm ); + bli_thrcomm_barrier( id, comm ); void* object = comm->sent_object; - bli_thrcomm_barrier( ti, id, comm ); + bli_thrcomm_barrier( id, comm ); return object; } @@ -257,7 +229,7 @@ void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm ) // the current barrier. The first n-1 threads will spin on this variable // until it changes. The sense variable gets incremented by the last // thread to enter the barrier, just before it exits. But it turns out - // that you don't need many unique IDs before you can wrap around. In + // that you don't need many unique IDs before you can wrap around. In // fact, if everything else is working, a binary variable is sufficient, // which is what we do here (i.e., 0 is incremented to 1, which is then // decremented back to 0, and so forth). 
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 4532fd00d..7abd190c7 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -120,10 +120,15 @@ BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) return comm->n_threads; } +BLIS_INLINE timpl_t bli_thrcomm_thread_impl( thrcomm_t* comm ) +{ + return comm->ti; +} + // Threading method-agnostic function prototypes. -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); +thrcomm_t* bli_thrcomm_create( timpl_t ti, pool_t* sba_pool, dim_t n_threads ); +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ); // Threading method-specific function prototypes. // NOTE: These are the prototypes to the dispatcher functions and thus they @@ -131,11 +136,11 @@ void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); // (and do) omit the timpl_t from their function signatures since their // threading implementation is intrinsically known. void bli_thrcomm_init( timpl_t ti, dim_t n_threads, thrcomm_t* comm ); -void bli_thrcomm_cleanup( timpl_t ti, thrcomm_t* comm ); -BLIS_EXPORT_BLIS void bli_thrcomm_barrier( timpl_t ti, dim_t thread_id, thrcomm_t* comm ); +void bli_thrcomm_cleanup( thrcomm_t* comm ); +BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); // Other function prototypes. 
-BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( timpl_t ti, dim_t inside_id, void* to_send, thrcomm_t* comm ); +BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index eefc20fdd..8904c88e3 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -35,9 +35,7 @@ #include "blis.h" -thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; -thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -thrcomm_t BLIS_SINGLE_COMM = {}; +thrcomm_t BLIS_SINGLE_COMM = {}; // The global rntm_t structure. (The definition resides in bli_rntm.c.) extern rntm_t global_rntm; @@ -46,13 +44,39 @@ extern rntm_t global_rntm; // resides in bli_rntm.c.) extern bli_pthread_mutex_t global_rntm_mutex; +typedef void (*thread_launch_t) + ( + dim_t nt, + thread_func_t func, + const void* params + ); + +static thread_launch_t thread_launch_fpa[ BLIS_NUM_THREAD_IMPLS ] = +{ + [BLIS_SINGLE] = bli_thread_launch_single, + [BLIS_OPENMP] = +#if defined(BLIS_ENABLE_OPENMP) + bli_thread_launch_openmp, +#elif defined(BLIS_ENABLE_PTHREADS) + NULL, +#else + NULL, +#endif + [BLIS_POSIX] = +#if defined(BLIS_ENABLE_PTHREADS) + bli_thread_launch_pthreads, +#elif defined(BLIS_ENABLE_OPENMP) + NULL, +#else + NULL, +#endif +}; + // ----------------------------------------------------------------------------- void bli_thread_init( void ) { bli_thrcomm_init( BLIS_SINGLE, 1, &BLIS_SINGLE_COMM ); - bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); // Read the environment variables and use them to initialize the // global runtime object. 
@@ -65,6 +89,19 @@ void bli_thread_finalize( void ) // ----------------------------------------------------------------------------- +void bli_thread_launch + ( + timpl_t ti, + dim_t nt, + thread_func_t func, + const void* params + ) +{ + thread_launch_fpa[ti]( nt, func, params ); +} + +// ----------------------------------------------------------------------------- + void bli_thread_range_sub ( const thrinfo_t* thread, @@ -75,11 +112,11 @@ void bli_thread_range_sub dim_t* end ) { - dim_t n_way = bli_thread_n_way( thread ); + dim_t n_way = bli_thrinfo_n_way( thread ); if ( n_way == 1 ) { *start = 0; *end = n; return; } - dim_t work_id = bli_thread_work_id( thread ); + dim_t work_id = bli_thrinfo_work_id( thread ); dim_t all_start = 0; dim_t all_end = n; @@ -515,8 +552,8 @@ siz_t bli_thread_range_weighted_sub dim_t* j_end_thr ) { - dim_t n_way = bli_thread_n_way( thread ); - dim_t my_id = bli_thread_work_id( thread ); + dim_t n_way = bli_thrinfo_n_way( thread ); + dim_t my_id = bli_thrinfo_work_id( thread ); dim_t bf_left = n % bf; diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 88bdccda5..821e2fe7c 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -42,29 +42,30 @@ // Include thread info (thrinfo_t) object definitions and prototypes. #include "bli_thrinfo.h" -#include "bli_thrinfo_sup.h" -// Include some operation-specific thrinfo_t prototypes. -// Note that the bli_packm_thrinfo.h must be included before the others! -#include "bli_packm_thrinfo.h" -#include "bli_l3_thrinfo.h" +// Thread launch prototypes. Must go before including implementation headers. +typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params ); -// Include the level-3 thread decorator and related definitions and prototypes -// for the conventional code path. -#include "bli_l3_decor.h" - -// Include the level-3 thread decorator and related definitions and prototypes -// for the sup code path.
-#include "bli_l3_sup_decor.h" +// Include threading implementations. +#include "bli_thread_openmp.h" +#include "bli_thread_pthreads.h" +#include "bli_thread_single.h" // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); +BLIS_EXPORT_BLIS void bli_thread_launch + ( + timpl_t ti, + dim_t nt, + thread_func_t func, + const void* params + ); + // Thread range-related prototypes. -BLIS_EXPORT_BLIS -void bli_thread_range_sub +BLIS_EXPORT_BLIS void bli_thread_range_sub ( const thrinfo_t* thread, dim_t n, @@ -224,8 +225,8 @@ BLIS_INLINE void bli_thread_range_jrir_rr ) { // Use interleaved partitioning of jr/ir loops. - *start = bli_thread_work_id( thread ); - *inc = bli_thread_n_way( thread ); + *start = bli_thrinfo_work_id( thread ); + *inc = bli_thrinfo_n_way( thread ); *end = n; } @@ -295,8 +296,8 @@ BLIS_INLINE void bli_thread_range_weighted_jrir #else // Use interleaved partitioning of jr/ir loops. - *start = bli_thread_work_id( thread ); - *inc = bli_thread_n_way( thread ); + *start = bli_thrinfo_work_id( thread ); + *inc = bli_thrinfo_n_way( thread ); *end = n; #endif diff --git a/frame/thread/bli_l3_decor_openmp.h b/frame/thread/bli_thread_openmp.c similarity index 69% rename from frame/thread/bli_l3_decor_openmp.h rename to frame/thread/bli_thread_openmp.c index 95e1582e5..c7a74832b 100644 --- a/frame/thread/bli_l3_decor_openmp.h +++ b/frame/thread/bli_thread_openmp.c @@ -5,7 +5,6 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,35 +32,32 @@ */ -#ifndef BLIS_L3_DECOR_OPENMP_H -#define BLIS_L3_DECOR_OPENMP_H +#include "blis.h" -// Definitions specific to situations when OpenMP multithreading is enabled. 
#ifdef BLIS_ENABLE_OPENMP -void bli_l3_thread_decorator_openmp - ( - l3int_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl - ); +void bli_thread_launch_openmp( dim_t n_threads, thread_func_t func, const void* params ) +{ + const timpl_t ti = BLIS_OPENMP; -void bli_l3_thread_decorator_thread_check - ( - dim_t n_threads, - dim_t tid, - thrcomm_t* gl_comm, - rntm_t* rntm - ); + // Allocate a global communicator for the root thrinfo_t structures. + pool_t* gl_comm_pool = NULL; + thrcomm_t* gl_comm = bli_thrcomm_create( ti, gl_comm_pool, n_threads ); -#endif + _Pragma( "omp parallel num_threads(n_threads)" ) + { + // Query the thread's id from OpenMP. + const dim_t tid = omp_get_thread_num(); + + // Call the thread entry point, passing the global communicator, the + // thread id, and the params struct as arguments. + func( gl_comm, tid, params ); + } + + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( gl_comm_pool, gl_comm ); +} #endif diff --git a/frame/thread/bli_l3_sup_decor_openmp.h b/frame/thread/bli_thread_openmp.h similarity index 82% rename from frame/thread/bli_l3_sup_decor_openmp.h rename to frame/thread/bli_thread_openmp.h index 4c5059d00..d26023a15 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.h +++ b/frame/thread/bli_thread_openmp.h @@ -32,23 +32,17 @@ */ -#ifndef BLIS_L3_SUP_DECOR_OPENMP_H -#define BLIS_L3_SUP_DECOR_OPENMP_H +#ifndef BLIS_THREAD_OPENMP_H +#define BLIS_THREAD_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. 
#ifdef BLIS_ENABLE_OPENMP -err_t bli_l3_sup_thread_decorator_openmp +void bli_thread_launch_openmp ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + dim_t nt, + thread_func_t func, + const void* params ); #endif diff --git a/frame/thread/bli_thread_pthreads.c b/frame/thread/bli_thread_pthreads.c new file mode 100644 index 000000000..88a11cf11 --- /dev/null +++ b/frame/thread/bli_thread_pthreads.c @@ -0,0 +1,128 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS + +// A data structure to assist in passing operands to additional threads. +typedef struct thread_data +{ + dim_t tid; + thrcomm_t* gl_comm; + thread_func_t func; + const void* params; +} thread_data_t; + +// Entry point for additional threads +static void* bli_posix_thread_entry( void* data_void ) +{ + const thread_data_t* data = data_void; + + const dim_t tid = data->tid; + thrcomm_t* gl_comm = data->gl_comm; + thread_func_t func = data->func; + const void* params = data->params; + + // Call the thread entry point, passing the global communicator, the + // thread id, and the params struct as arguments. + func( gl_comm, tid, params ); + + return NULL; +} + +void bli_thread_launch_pthreads( dim_t n_threads, thread_func_t func, const void* params ) +{ + err_t r_val; + + const timpl_t ti = BLIS_POSIX; + + // Allocate a global communicator for the root thrinfo_t structures. + pool_t* gl_comm_pool = NULL; + thrcomm_t* gl_comm = bli_thrcomm_create( ti, gl_comm_pool, n_threads ); + + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions. 
+ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) + { + // Set up thread data for every thread, including thread 0 (the chief). + datas[tid].tid = tid; + datas[tid].gl_comm = gl_comm; + datas[tid].func = func; + datas[tid].params = params; + + // Spawn additional threads for ids greater than 0; thread 0 runs the entry point itself. + if ( tid != 0 ) + bli_pthread_create( &pthreads[tid], NULL, &bli_posix_thread_entry, &datas[tid] ); + else + bli_posix_thread_entry( &datas[0] ); + } + + // Thread 0 waits for additional threads to finish. + for ( dim_t tid = 1; tid < n_threads; tid++ ) + { + bli_pthread_join( pthreads[tid], NULL ); + } + + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( gl_comm_pool, gl_comm ); + + // Free the array of pthread objects and auxiliary data structs. 
+ #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( pthreads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_free_intl( datas ); +} + +#endif + diff --git a/frame/thread/bli_l3_decor_single.h b/frame/thread/bli_thread_pthreads.h similarity index 82% rename from frame/thread/bli_l3_decor_single.h rename to frame/thread/bli_thread_pthreads.h index c118ad7be..5fb82e292 100644 --- a/frame/thread/bli_l3_decor_single.h +++ b/frame/thread/bli_thread_pthreads.h @@ -32,22 +32,20 @@ */ -#ifndef BLIS_L3_DECOR_SINGLE_H -#define BLIS_L3_DECOR_SINGLE_H +#ifndef BLIS_THREAD_PTHREADS_H +#define BLIS_THREAD_PTHREADS_H -void bli_l3_thread_decorator_single +// Definitions specific to situations when POSIX multithreading is enabled. +#ifdef BLIS_ENABLE_PTHREADS + +void bli_thread_launch_pthreads ( - l3int_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + dim_t nt, + thread_func_t func, + const void* params ); #endif +#endif + diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.h b/frame/thread/bli_thread_single.c similarity index 77% rename from sandbox/gemmlike/thread/bls_l3_decor_single.h rename to frame/thread/bli_thread_single.c index 82dfbc993..323e0d8f2 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.h +++ b/frame/thread/bli_thread_single.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. 
- Copyright (C) 2021, The University of Texas at Austin + Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,23 +32,12 @@ */ -#ifndef BLIS_SBX_L3_DECOR_SINGLE_H -#define BLIS_SBX_L3_DECOR_SINGLE_H - -void bls_l3_thread_decorator_single - ( - l3sbxint_ft func, - opid_t family, - //pack_t schema_a, - //pack_t schema_b, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -#endif +#include "blis.h" + +void bli_thread_launch_single( dim_t nt, thread_func_t func, const void* params ) +{ + // Call the thread entry point, passing the global single-threaded + // communicator, thread id of 0, and the params struct as arguments. + func( &BLIS_SINGLE_COMM, 0, params ); +} diff --git a/frame/thread/bli_l3_sup_decor_single.h b/frame/thread/bli_thread_single.h similarity index 81% rename from frame/thread/bli_l3_sup_decor_single.h rename to frame/thread/bli_thread_single.h index 8ca279baf..fda91232e 100644 --- a/frame/thread/bli_l3_sup_decor_single.h +++ b/frame/thread/bli_thread_single.h @@ -32,20 +32,14 @@ */ -#ifndef BLIS_L3_SUP_DECOR_SINGLE_H -#define BLIS_L3_SUP_DECOR_SINGLE_H +#ifndef BLIS_THREAD_SINGLE_H +#define BLIS_THREAD_SINGLE_H -err_t bli_l3_sup_thread_decorator_single +void bli_thread_launch_single ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + dim_t nt, + thread_func_t func, + const void* params ); #endif diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 3730ab946..f48e70bb6 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -35,101 +35,83 @@ #include "blis.h" -thrinfo_t* bli_thrinfo_create +#define BLIS_NUM_STATIC_COMMS 80 + +thrinfo_t* bli_thrinfo_create_root ( - rntm_t* rntm, - thrcomm_t* 
ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node + thrcomm_t* comm, + dim_t thread_id, + pool_t* sba_pool, + pba_t* pba ) { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_thrinfo_create(): " ); - #endif - - thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) ); - - bli_thrinfo_init + return bli_thrinfo_create ( - thread, - ocomm, ocomm_id, - n_way, work_id, - free_comm, - bszid, - sub_node + comm, + thread_id, + 1, + 0, + FALSE, + sba_pool, + pba ); - - return thread; } -void bli_thrinfo_init +thrinfo_t* bli_thrinfo_create ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, + thrcomm_t* comm, + dim_t thread_id, dim_t n_way, dim_t work_id, bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node + pool_t* sba_pool, + pba_t* pba ) { - bli_thrinfo_set_ocomm( ocomm, thread ); - bli_thrinfo_set_ocomm_id( ocomm_id, thread ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_thrinfo_create(): " ); + #endif + + thrinfo_t* thread = bli_sba_acquire( sba_pool, sizeof( thrinfo_t ) ); + + bli_thrinfo_set_comm( comm, thread ); + bli_thrinfo_set_thread_id( thread_id, thread ); bli_thrinfo_set_n_way( n_way, thread ); bli_thrinfo_set_work_id( work_id, thread ); bli_thrinfo_set_free_comm( free_comm, thread ); - bli_thrinfo_set_bszid( bszid, thread ); + bli_thrinfo_set_sba_pool( sba_pool, thread ); + bli_thrinfo_set_pba( pba, thread ); + bli_mem_clear( bli_thrinfo_mem( thread ) ); - bli_thrinfo_set_sub_node( sub_node, thread ); + bli_thrinfo_set_sub_node( NULL, thread ); bli_thrinfo_set_sub_prenode( NULL, thread ); -} -void bli_thrinfo_init_single - ( - thrinfo_t* thread - ) -{ - bli_thrinfo_init - ( - thread, - &BLIS_SINGLE_COMM, 0, - 1, - 0, - FALSE, - BLIS_NO_PART, - thread - ); + return thread; } void bli_thrinfo_free ( - rntm_t* rntm, thrinfo_t* thread ) { - if ( thread == NULL || - thread == &BLIS_PACKM_SINGLE_THREADED || - thread == &BLIS_GEMM_SINGLE_THREADED - ) return; + if ( thread == NULL ) 
return; thrinfo_t* thrinfo_sub_prenode = bli_thrinfo_sub_prenode( thread ); thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); + pool_t* sba_pool = bli_thrinfo_sba_pool( thread ); + mem_t* cntl_mem_p = bli_thrinfo_mem( thread ); + pba_t* pba = bli_thrinfo_pba( thread ); // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_prenode != NULL ) { - bli_thrinfo_free( rntm, thrinfo_sub_prenode ); + bli_thrinfo_free( thrinfo_sub_prenode ); } // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_node != NULL ) { - bli_thrinfo_free( rntm, thrinfo_sub_node ); + bli_thrinfo_free( thrinfo_sub_node ); } // Free the communicators, but only if the current thrinfo_t struct @@ -139,198 +121,48 @@ void bli_thrinfo_free if ( bli_thrinfo_needs_free_comm( thread ) ) { // The ochief always frees his communicator. - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( rntm, bli_thrinfo_ocomm( thread ) ); + if ( bli_thrinfo_am_chief( thread ) ) + bli_thrcomm_free( sba_pool, bli_thrinfo_comm( thread ) ); } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrinfo_free(): " ); #endif - // Free the thrinfo_t struct. - bli_sba_release( rntm, thread ); -} - -// ----------------------------------------------------------------------------- - -void bli_thrinfo_grow - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - // First, consider the prenode branch of the thrinfo_t tree, which should be - // expanded only if there exists a prenode branch in the cntl_t tree. - - if ( bli_cntl_sub_prenode( cntl ) != NULL ) + // Free any allocated memory from the pba. + if ( bli_mem_is_alloc( cntl_mem_p ) && bli_thrinfo_am_chief( thread ) ) { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_prenode( thread ) == NULL ) - { - // Assertion / sanity check. 
- if ( bli_cntl_bszid( cntl ) != BLIS_MC ) - { - printf( "Assertion failed: Expanding prenode for non-IC loop?\n" ); - bli_abort(); - } - - // Now we must create the packa, jr, and ir nodes that make up - // the prenode branch of current cntl_t node. - - // Create a new node (or, if needed, multiple nodes) along the - // prenode branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_prenode = bli_thrinfo_rgrow_prenode - ( - rntm, - cntl, - bli_cntl_sub_prenode( cntl ), - thread - ); - - // Attach the child thrinfo_t node for the secondary branch to its - // parent structure. - bli_thrinfo_set_sub_prenode( thread_prenode, thread ); - } - } - - // Now, grow the primary branch of the thrinfo_t tree. - - // NOTE: If bli_thrinfo_rgrow() is being called, the sub_node field will - // always be non-NULL, and so there's no need to check it. - //if ( bli_cntl_sub_node( cntl ) != NULL ) - { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_node( thread ) == NULL ) - { - // Create a new node (or, if needed, multiple nodes) along the - // main sub-node branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_child = bli_thrinfo_rgrow - ( - rntm, - cntl, - bli_cntl_sub_node( cntl ), - thread - ); - - // Attach the child thrinfo_t node for the primary branch to its - // parent structure. - bli_thrinfo_set_sub_node( thread_child, thread ); - } - } -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_rgrow - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. 
- if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_create_for_cntl - ( - rntm, - cntl_par, - cntl_cur, - thread_par - ); - } - else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. - thrinfo_t* thread_seg = bli_thrinfo_rgrow - ( - rntm, - cntl_par, - bli_cntl_sub_node( cntl_cur ), - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. - thread_cur = bli_thrinfo_create + bli_pba_release ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_cntl_calc_num_threads_in( rntm, cntl_cur ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node + pba, + cntl_mem_p ); } - return thread_cur; + // Free the thrinfo_t struct. + bli_sba_release( sba_pool, thread ); } -#define BLIS_NUM_STATIC_COMMS 80 +// ----------------------------------------------------------------------------- -thrinfo_t* bli_thrinfo_create_for_cntl +thrinfo_t* bli_thrinfo_split ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, + dim_t n_way, thrinfo_t* thread_par ) { - // If we are running with a single thread, all of the code can be reduced - // and simplified to this. 
- if ( bli_rntm_calc_num_threads( rntm ) == 1 ) - { - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL // sub_node - ); - return thread_chl; - } - - thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; - thrcomm_t** new_comms = NULL; - - const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - const dim_t parent_work_id = bli_thread_work_id( thread_par ); + thrcomm_t* parent_comm = bli_thrinfo_comm( thread_par ); + const timpl_t ti = bli_thrcomm_thread_impl( parent_comm ); + const dim_t parent_num_threads = bli_thrinfo_num_threads( thread_par ); + const dim_t parent_thread_id = bli_thrinfo_thread_id( thread_par ); + pool_t* sba_pool = bli_thrinfo_sba_pool( thread_par ); + pba_t* pba = bli_thrinfo_pba( thread_par ); // Sanity check: make sure the number of threads in the parent's // communicator is divisible by the number of new sub-groups. - if ( parent_nt_in % parent_n_way != 0 ) + if ( parent_num_threads % n_way != 0 ) { - printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); + printf( "Assertion failed: parent_num_threads %% n_way != 0\n" ); bli_abort(); } @@ -339,312 +171,105 @@ thrinfo_t* bli_thrinfo_create_for_cntl // - the current thread's id within the new communicator, // - the current thread's work id, given the ways of parallelism // to be obtained within the next loop. 
- const dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); - const dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - -//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); - - // The parent's chief thread creates a temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - err_t r_val; - - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); - else - new_comms = static_comms; - } - - // Broadcast the temporary array to all threads in the parent's - // communicator. - new_comms = bli_thread_broadcast( rntm, thread_par, new_comms ); + const dim_t child_num_threads = parent_num_threads / n_way; + const dim_t child_thread_id = parent_thread_id % child_num_threads; + const dim_t child_work_id = parent_thread_id / child_num_threads; - // Chiefs in the child communicator allocate the communicator - // object and store it in the array element corresponding to the - // parent's work id. - if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - - bli_thread_barrier( rntm, thread_par ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. 
- thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - new_comms[ parent_work_id ], // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - bszid_chl, // bszid - NULL // sub_node - ); - - bli_thread_barrier( rntm, thread_par ); + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; + thrcomm_t** new_comms = NULL; + thrcomm_t* my_comm = NULL; + bool free_comm = FALSE; - // The parent's chief thread frees the temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) + if ( n_way == 1 ) { - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - bli_free_intl( new_comms ); + my_comm = parent_comm; } - - return thread_chl; -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_rgrow_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. - if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) + else if ( n_way == parent_num_threads ) { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_create_for_cntl_prenode - ( - rntm, - cntl_par, - cntl_cur, - thread_par - ); + my_comm = &BLIS_SINGLE_COMM; } - else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) + else { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. - thrinfo_t* thread_seg = bli_thrinfo_rgrow_prenode - ( - rntm, - cntl_par, - bli_cntl_sub_node( cntl_cur ), - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. 
Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. - thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_cntl_calc_num_threads_in( rntm, cntl_par ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node - ); - } + // The parent's chief thread creates a temporary array of thrcomm_t + // pointers. + if ( bli_thrinfo_am_chief( thread_par ) ) + { + err_t r_val; - return thread_cur; -} + if ( n_way > BLIS_NUM_STATIC_COMMS ) + new_comms = bli_malloc_intl( n_way * sizeof( thrcomm_t* ), &r_val ); + else + new_comms = static_comms; + } -thrinfo_t* bli_thrinfo_create_for_cntl_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ) -{ - // NOTE: This function only has to work for the ic -> (pa -> jr) - // thrinfo_t tree branch extension. After that, the function - // bli_thrinfo_create_for_cntl() will be called for the last jr->ir - // branch extension. + // Broadcast the temporary array to all threads in the parent's + // communicator. + new_comms = bli_thrinfo_broadcast( thread_par, new_comms ); - const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); + // Chiefs in the child communicator allocate the communicator + // object and store it in the array element corresponding to the + // child's work id. 
+ if ( child_thread_id == 0 ) + new_comms[ child_work_id ] = bli_thrcomm_create( ti, sba_pool, child_num_threads ); - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - //const dim_t parent_work_id = bli_thread_work_id( thread_par ); + bli_thrinfo_barrier( thread_par ); - // Sanity check: make sure the number of threads in the parent's - // communicator is divisible by the number of new sub-groups. - if ( parent_nt_in % parent_n_way != 0 ) - { - printf( "Assertion failed: parent_nt_in (%d) parent_n_way (%d) != 0\n", - ( int )parent_nt_in, ( int )parent_n_way ); - bli_abort(); + my_comm = new_comms[ child_work_id ]; + free_comm = TRUE; } - //dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); - //dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); - const dim_t child_nt_in = parent_nt_in; - const dim_t child_n_way = parent_nt_in; - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - - bli_thread_barrier( rntm, thread_par ); - - // NOTE: Recall that parent_comm_id == child_comm_id, so checking for the - // parent's chief-ness is equivalent to checking for chief-ness in the new - // about-to-be-created communicator group. - thrcomm_t* new_comm = NULL; - if ( bli_thread_am_ochief( thread_par ) ) - new_comm = bli_thrcomm_create( rntm, child_nt_in ); - - // Broadcast the new thrcomm_t address to the other threads in the - // parent's group. - new_comm = bli_thread_broadcast( rntm, thread_par, new_comm ); - // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. 
thrinfo_t* thread_chl = bli_thrinfo_create ( - rntm, // rntm - new_comm, // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - bszid_chl, // bszid - NULL // sub_node + my_comm, + child_thread_id, + n_way, + child_work_id, + free_comm, + sba_pool, + pba ); - bli_thread_barrier( rntm, thread_par ); + bli_thrinfo_barrier( thread_par ); + + // The parent's chief thread frees the temporary array of thrcomm_t + // pointers. + if ( bli_thrinfo_am_chief( thread_par ) && + new_comms != static_comms ) + { + bli_free_intl( new_comms ); + } return thread_chl; } -// ----------------------------------------------------------------------------- - -#if 0 -void bli_thrinfo_grow_tree +void bli_thrinfo_print ( - rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ) { - cntl_t* cntl_jc = cntl; - thrinfo_t* thrinfo_jc = thread; - - bli_thrinfo_grow( rntm, cntl_jc, thrinfo_jc ); - - // inside jc loop: - cntl_t* cntl_pc = bli_cntl_sub_node( cntl_jc ); - thrinfo_t* thrinfo_pc = bli_thrinfo_sub_node( thrinfo_jc ); - - bli_thrinfo_grow( rntm, cntl_pc, thrinfo_pc ); - - // inside pc loop: - cntl_t* cntl_pb = bli_cntl_sub_node( cntl_pc ); - thrinfo_t* thrinfo_pb = bli_thrinfo_sub_node( thrinfo_pc ); - - bli_thrinfo_grow( rntm, cntl_pb, thrinfo_pb ); - - // after pb packing: - cntl_t* cntl_ic = bli_cntl_sub_node( cntl_pb ); - thrinfo_t* thrinfo_ic = bli_thrinfo_sub_node( thrinfo_pb ); - - bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); - - // -- main branch -- - - // inside ic loop: - cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); - thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); - - // after pa packing: - cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); - thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); - - bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); - - // inside jr loop: - //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); - //thrinfo_t* thrinfo_ir = 
bli_thrinfo_sub_node( thrinfo_jr ); - - // -- trsm branch -- - - // inside ic loop: - cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); - thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); - - // after pa packing: - cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); - thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); - - bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); - - // inside jr loop: - //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); - //thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 ); + printf( " lvl nt tid nway wkid free\n" ); + bli_thrinfo_print_sub( thread, 0 ); } -void bli_thrinfo_grow_tree_ic +void bli_thrinfo_print_sub ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + thrinfo_t* thread, + gint_t level ) { - cntl_t* cntl_ic = cntl; - thrinfo_t* thrinfo_ic = thread; - - bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); - - // -- main branch -- - - // inside ic loop: - cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); - thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); - - // after pa packing: - cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); - thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); - - bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); - - // inside jr loop: - //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); - //thrinfo_t* thrinfo_ir = bli_thrinfo_sub_node( thrinfo_jr ); - - // -- trsm branch -- - - // inside ic loop: - cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); - thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); - - // after pa packing: - cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); - thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); - - bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); - - // inside jr loop: - //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); - //thrinfo_t* thrinfo_ir0= 
bli_thrinfo_sub_node( thrinfo_jr0 ); + if ( thread == NULL ) return; + + printf( "%4ld %4ld %4ld %4ld %4ld %4ld\n", + ( unsigned long )level, + ( unsigned long )bli_thrinfo_num_threads( thread ), + ( unsigned long )bli_thrinfo_thread_id( thread ), + ( unsigned long )bli_thrinfo_n_way( thread ), + ( unsigned long )bli_thrinfo_work_id( thread ), + ( unsigned long )bli_thrinfo_needs_free_comm( thread )); + + bli_thrinfo_print_sub( bli_thrinfo_sub_prenode( thread ), level+1 ); + bli_thrinfo_print_sub( bli_thrinfo_sub_node( thread ), level+1 ); } -#endif + diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 9d234bc91..d15fb49f6 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -41,15 +41,16 @@ struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. - thrcomm_t* ocomm; + thrcomm_t* comm; - // Our thread id within the ocomm thread communicator. - dim_t ocomm_id; + // Our thread id within the thread communicator. + dim_t thread_id; - // The number of distinct threads used to parallelize the loop. + // The number of communicators which are "siblings" of our communicator. dim_t n_way; - // What we're working on. + // An id to identify what we're working on. This is the same for all threads + // in the same communicator, and 0 <= work_id < n_way. dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, @@ -58,9 +59,14 @@ struct thrinfo_s // to false. bool free_comm; - // The bszid_t to help identify the node. This is mostly only useful when - // debugging or tracing the allocation and release of thrinfo_t nodes. - bszid_t bszid; + // The small block pool. + pool_t* sba_pool; + + // The packing block allocator. + pba_t* pba; + + // Storage for allocated memory obtained from the packing block allocator. 
+ mem_t mem; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; @@ -69,35 +75,33 @@ typedef struct thrinfo_s thrinfo_t; // // thrinfo_t functions -// NOTE: The naming of these should be made consistent at some point. -// (ie: bli_thrinfo_ vs. bli_thread_) // // thrinfo_t query (field only) -BLIS_INLINE dim_t bli_thread_num_threads( const thrinfo_t* t ) +BLIS_INLINE dim_t bli_thrinfo_num_threads( const thrinfo_t* t ) { - return (t->ocomm)->n_threads; + return (t->comm)->n_threads; } -BLIS_INLINE dim_t bli_thread_ocomm_id( const thrinfo_t* t ) +BLIS_INLINE dim_t bli_thrinfo_thread_id( const thrinfo_t* t ) { - return t->ocomm_id; + return t->thread_id; } -BLIS_INLINE dim_t bli_thread_n_way( const thrinfo_t* t ) +BLIS_INLINE dim_t bli_thrinfo_n_way( const thrinfo_t* t ) { return t->n_way; } -BLIS_INLINE dim_t bli_thread_work_id( const thrinfo_t* t ) +BLIS_INLINE dim_t bli_thrinfo_work_id( const thrinfo_t* t ) { return t->work_id; } -BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( const thrinfo_t* t ) +BLIS_INLINE thrcomm_t* bli_thrinfo_comm( const thrinfo_t* t ) { - return t->ocomm; + return t->comm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) @@ -105,9 +109,19 @@ BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) return t->free_comm; } -BLIS_INLINE dim_t bli_thread_bszid( const thrinfo_t* t ) +BLIS_INLINE pool_t* bli_thrinfo_sba_pool( const thrinfo_t* t ) +{ + return t->sba_pool; +} + +BLIS_INLINE pba_t* bli_thrinfo_pba( const thrinfo_t* t ) +{ + return t->pba; +} + +BLIS_INLINE mem_t* bli_thrinfo_mem( thrinfo_t* t ) { - return t->bszid; + return &t->mem; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t ) @@ -122,21 +136,21 @@ BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t ) // thrinfo_t query (complex) -BLIS_INLINE bool bli_thread_am_ochief( const thrinfo_t* t ) +BLIS_INLINE bool bli_thrinfo_am_chief( const thrinfo_t* t ) { - return t->ocomm_id == 0; + return t->thread_id == 0; } // 
thrinfo_t modification -BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_comm( thrcomm_t* comm, thrinfo_t* t ) { - t->ocomm = ocomm; + t->comm = comm; } -BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_thread_id( dim_t thread_id, thrinfo_t* t ) { - t->ocomm_id = ocomm_id; + t->thread_id = thread_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) @@ -154,9 +168,14 @@ BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) t->free_comm = free_comm; } -BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_sba_pool( pool_t* sba_pool, thrinfo_t* t ) +{ + t->sba_pool = sba_pool; +} + +BLIS_INLINE void bli_thrinfo_set_pba( pba_t* pba, thrinfo_t* t ) { - t->bszid = bszid; + t->pba = pba; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) @@ -171,22 +190,14 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* // other thrinfo_t-related functions -BLIS_INLINE void* bli_thread_broadcast( const rntm_t* rntm, const thrinfo_t* t, void* p ) +BLIS_INLINE void* bli_thrinfo_broadcast( const thrinfo_t* t, void* p ) { - // We can't use any bli_rntm_*() APIs here because they haven't been - // defined yet. So we have to manually access the timpl_t field (le ugh). - //const timpl_t ti = bli_rntm_thread_impl( rntm ); - - return bli_thrcomm_bcast( rntm->thread_impl, t->ocomm_id, p, t->ocomm ); + return bli_thrcomm_bcast( t->thread_id, p, t->comm ); } -BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_barrier( const thrinfo_t* t ) { - // We can't use any bli_rntm_*() APIs here because they haven't been - // defined yet. So we have to manually access the timpl_t field (le ugh). 
- //const timpl_t ti = bli_rntm_thread_impl( rntm ); - - bli_thrcomm_barrier( rntm->thread_impl, t->ocomm_id, t->ocomm ); + bli_thrcomm_barrier( t->thread_id, t->comm ); } @@ -194,98 +205,47 @@ BLIS_INLINE void bli_thread_barrier( const rntm_t* rntm, const thrinfo_t* t ) // Prototypes for level-3 thrinfo functions not specific to any operation. // -thrinfo_t* bli_thrinfo_create +thrinfo_t* bli_thrinfo_create_root ( - rntm_t* rntm, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node + thrcomm_t* comm, + dim_t thread_id, + pool_t* sba_pool, + pba_t* pba ); -void bli_thrinfo_init +thrinfo_t* bli_thrinfo_create ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, + thrcomm_t* comm, + dim_t thread_id, dim_t n_way, dim_t work_id, bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_thrinfo_init_single - ( - thrinfo_t* thread + pool_t* sba_pool, + pba_t* pba ); -void bli_thrinfo_free +BLIS_EXPORT_BLIS void bli_thrinfo_free ( - rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- -void bli_thrinfo_grow +thrinfo_t* bli_thrinfo_split ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_rgrow - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_create_for_cntl - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_rgrow_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, + dim_t n_way, thrinfo_t* thread_par ); -thrinfo_t* bli_thrinfo_create_for_cntl_prenode +void bli_thrinfo_print ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ); - -// ----------------------------------------------------------------------------- - -#if 0 -void bli_thrinfo_grow_tree - ( - rntm_t* rntm, - cntl_t* cntl, thrinfo_t* thread ); -void 
bli_thrinfo_grow_tree_ic +void bli_thrinfo_print_sub ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + thrinfo_t* thread, + gint_t level ); -#endif #endif diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c deleted file mode 100644 index 26a40e00f..000000000 --- a/frame/thread/bli_thrinfo_sup.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_thrinfo_sup_grow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - thrinfo_t* thread - ) -{ - if ( thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_PACKM_SINGLE_THREADED ) return; - - // NOTE: If bli_thrinfo_sup_rgrow() is being called, the sub_node field will - // always be non-NULL, and so there's no need to check it. - //if ( bli_cntl_sub_node( cntl ) != NULL ) - { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_node( thread ) == NULL ) - { - // Create a new node (or, if needed, multiple nodes) along the - // main sub-node branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_child = bli_thrinfo_sup_rgrow - ( - rntm, - bszid_par, - &bszid_par[1], - thread - ); - - // Attach the child thrinfo_t node for the primary branch to its - // parent structure. - bli_thrinfo_set_sub_node( thread_child, thread ); - } - } -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_sup_rgrow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. 
- if ( *bszid_cur != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_sup_create_for_cntl - ( - rntm, - bszid_par, - bszid_cur, - thread_par - ); - } - else // if ( *bszid_cur == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. - thrinfo_t* thread_seg = bli_thrinfo_sup_rgrow - ( - rntm, - bszid_par, - &bszid_cur[1], - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. - thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_rntm_calc_num_threads_in( bszid_cur, rntm ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node - ); - } - - return thread_cur; -} - -#define BLIS_NUM_STATIC_COMMS 80 - -thrinfo_t* bli_thrinfo_sup_create_for_cntl - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_chl, - thrinfo_t* thread_par - ) -{ - // If we are running with a single thread, all of the code can be reduced - // and simplified to this. 
- if ( bli_rntm_calc_num_threads( rntm ) == 1 ) - { - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL // sub_node - ); - - return thread_chl; - } - - // The remainder of this function handles the cases involving the use of - // multiple BLIS threads. - - if ( bli_rntm_pack_a( rntm ) == FALSE && - bli_rntm_pack_b( rntm ) == FALSE ) - { - // If we are packing neither A nor B, there are no broadcasts or barriers - // needed to synchronize threads (since all threads can work completely - // independently). In this special case situation, the thrinfo_t can be - // created with much simpler logic. - - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - - // Compute: - // - the number of threads inside the new child comm, - // - the current thread's id within the new communicator, - // - the current thread's work id, given the ways of parallelism - // to be obtained within the next loop. - const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); - const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - NULL, // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - *bszid_chl, // bszid - NULL // sub_node - ); - - return thread_chl; - } - else - { - // If we are packing at least one of A or B, then we use the general - // approach that employs broadcasts and barriers. 
- - thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; - thrcomm_t** new_comms = NULL; - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - const dim_t parent_work_id = bli_thread_work_id( thread_par ); - - // Sanity check: make sure the number of threads in the parent's - // communicator is divisible by the number of new sub-groups. - if ( parent_nt_in % parent_n_way != 0 ) - { - printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); - bli_abort(); - } - - // Compute: - // - the number of threads inside the new child comm, - // - the current thread's id within the new communicator, - // - the current thread's work id, given the ways of parallelism - // to be obtained within the next loop. - const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); - const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - -//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); - - // The parent's chief thread creates a temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - err_t r_val; - - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); - else - new_comms = static_comms; - } - - // Broadcast the temporary array to all threads in the parent's - // communicator. - new_comms = bli_thread_broadcast( rntm, thread_par, new_comms ); - - // Chiefs in the child communicator allocate the communicator - // object and store it in the array element corresponding to the - // parent's work id. 
- if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - - bli_thread_barrier( rntm, thread_par ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - new_comms[ parent_work_id ], // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - *bszid_chl, // bszid - NULL // sub_node - ); - - bli_thread_barrier( rntm, thread_par ); - - // The parent's chief thread frees the temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - bli_free_intl( new_comms ); - } - - return thread_chl; - } -} - diff --git a/frame/thread/bli_thrinfo_sup.h b/frame/thread/bli_thrinfo_sup.h deleted file mode 100644 index 1afcd3337..000000000 --- a/frame/thread/bli_thrinfo_sup.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_THRINFO_SUP_H -#define BLIS_THRINFO_SUP_H - -// -// Prototypes for level-3 thrinfo sup functions. -// - -void bli_thrinfo_sup_grow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_sup_rgrow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_sup_create_for_cntl - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_chl, - thrinfo_t* thread_par - ); - -#endif diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index abc9c9089..5bd03882a 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -74,7 +74,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, incx, \ asum, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -110,7 +110,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ m, \ a, rs_a, cs_a, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -153,7 +153,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, incx, \ norm, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } @@ -204,7 +204,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( ctype* )x, rs_x, cs_x, \ norm, \ ( cntx_t* )cntx, \ - 
rntm \ + ( rntm_t* )rntm \ ); \ } @@ -248,7 +248,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ x, incx, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* Check the 1-norm of the randomzied vector. In the unlikely event that @@ -310,7 +310,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ x, rs_x, cs_x, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ \ /* Check the 1-norm of the randomzied matrix. In the unlikely event that @@ -366,7 +366,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ scale, \ sumsq, \ ( cntx_t* )cntx, \ - rntm \ + ( rntm_t* )rntm \ ); \ } diff --git a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c index 8caccf923..133786843 100644 --- a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c +++ b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c @@ -374,8 +374,8 @@ void PASTECH2(bls_,ch,varname) \ /* Query the number of threads and thread ids for the JR loop. NOTE: These values are only needed when computing the next micropanel of B. */ \ - const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ - const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ + const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ @@ -404,8 +404,8 @@ void PASTECH2(bls_,ch,varname) \ /* Query the number of threads and thread ids for the IR loop. NOTE: These values are only needed when computing the next micropanel of A. */ \ - const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ - const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ + const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \ \ /* Compute number of primary and leftover components of the IR loop. 
*/ \ dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ @@ -464,7 +464,7 @@ void PASTECH2(bls_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. */ \ - bli_thread_barrier( thread_pb ); \ + bli_thrinfo_barrier( thread_pb ); \ } \ } \ \ diff --git a/sandbox/gemmlike/bli_gemm_ex.c b/sandbox/gemmlike/bli_gemm_ex.c index fe220e603..f8e8f86f7 100644 --- a/sandbox/gemmlike/bli_gemm_ex.c +++ b/sandbox/gemmlike/bli_gemm_ex.c @@ -52,7 +52,7 @@ void bli_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -90,7 +90,7 @@ void bli_gemm_ex bli_gemm_front ( ( obj_t* )alpha, ( obj_t* )a, ( obj_t* )b, ( obj_t* )beta, ( obj_t* )c, - ( cntx_t* )cntx, ( rntm_t* )rntm, NULL + ( cntx_t* )cntx, ( rntm_t* )rntm ); } diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index 1e567a114..ba930ebc5 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -67,7 +67,7 @@ void bls_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -75,8 +75,8 @@ void bls_gemm_ex // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Set the .pack_a and .pack_b fields to TRUE. This is only needed because // this sandbox uses bli_thrinfo_sup_grow(), which calls @@ -87,8 +87,8 @@ void bls_gemm_ex // while this sandbox implementation executes (and it also reinforces the // fact that we *are* indeed packing A and B, albeit not in the sup context // originally envisioned for the .pack_a and .pack_b fields). 
- bli_rntm_set_pack_a( TRUE, rntm ); - bli_rntm_set_pack_b( TRUE, rntm ); + bli_rntm_set_pack_a( TRUE, &rntm_l ); + bli_rntm_set_pack_b( TRUE, &rntm_l ); // Obtain a valid (native) context from the gks if necessary. // NOTE: This must be done before calling the _check() function, since @@ -166,7 +166,7 @@ void bls_gemm_ex bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), - rntm + &rntm_l ); // Spawn threads (if applicable), where bls_gemm_int() is the thread entry @@ -182,7 +182,7 @@ void bls_gemm_ex ( obj_t* )beta, ( obj_t* )&c_local, ( cntx_t* )cntx, - rntm + &rntm_l ); } diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h index d01c6647e..7380f02ad 100644 --- a/sandbox/gemmlike/bls_gemm.h +++ b/sandbox/gemmlike/bls_gemm.h @@ -53,7 +53,7 @@ void bls_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); // diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index c8fd50083..28c5032bc 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -186,42 +186,13 @@ void PASTECH2(bls_,ch,varname) \ \ auxinfo_t aux; \ \ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ - /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. 
*/ \ - bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ - BLIS_KC, /* 4th loop */ \ - BLIS_NO_PART, /* pack B */ \ - BLIS_MC, /* 3rd loop */ \ - BLIS_NO_PART, /* pack A */ \ - BLIS_NR, /* 2nd loop */ \ - BLIS_MR, /* 1st loop */ \ - BLIS_KR }; /* microkernel loop */ \ -\ - bszid_t* restrict bszids_jc = &bszids[0]; \ - bszid_t* restrict bszids_pc = &bszids[1]; \ - /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ - bszid_t* restrict bszids_ic = &bszids[3]; \ - /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ - bszid_t* restrict bszids_jr = &bszids[5]; \ - /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ -\ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ - thrinfo_t* restrict thread_ir = NULL; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ + thrinfo_t* restrict thread_jc = bli_thrinfo_sub_node( thread ); \ + thrinfo_t* restrict thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + thrinfo_t* restrict thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + thrinfo_t* restrict thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + thrinfo_t* restrict thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + thrinfo_t* restrict thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + thrinfo_t* restrict thread_ir = bli_thrinfo_sub_node( thread_jr ); \ \ /* Compute the JC loop thread range for the current thread. */ \ dim_t jc_start, jc_end; \ @@ -240,10 +211,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* restrict b_jc = b_00 + jj * jcstep_b; \ ctype* restrict c_jc = c_00 + jj * jcstep_c; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. 
*/ \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ const dim_t pc_start = 0, pc_end = k; \ @@ -267,14 +234,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Identify the current thrinfo_t node. Note that the thrinfo_t - node will have already been created by a previous call to - bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART - cause the tree to grow by two (e.g. to the next bszid that is - a normal bszid_t value). */ \ - thread_pb = bli_thrinfo_sub_node( thread_pc ); \ - /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ \ /* Determine the packing buffer and related parameters for matrix B. Then call the packm implementation. */ \ @@ -288,18 +247,12 @@ void PASTECH2(bls_,ch,varname) \ &b_use, &rs_b_use, &cs_b_use, \ &ps_b_use, \ cntx, \ - rntm, \ - &mem_b, \ thread_pb \ ); \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ ctype* restrict b_pc_use = b_use; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ dim_t ic_start, ic_end; \ @@ -321,14 +274,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Identify the current thrinfo_t node. Note that the thrinfo_t - node will have already been created by a previous call to - bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART - cause the tree to grow by two (e.g. to the next bszid that is - a normal bszid_t value). */ \ - thread_pa = bli_thrinfo_sub_node( thread_ic ); \ - /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ \ /* Determine the packing buffer and related parameters for matrix A. Then call the packm implementation. 
*/ \ @@ -342,24 +287,18 @@ void PASTECH2(bls_,ch,varname) \ &a_use, &rs_a_use, &cs_a_use, \ &ps_a_use, \ cntx, \ - rntm, \ - &mem_a, \ thread_pa \ ); \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ ctype* restrict a_ic_use = a_use; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Query the number of threads and thread ids for the JR loop. NOTE: These values are only needed when computing the next micropanel of B. */ \ - const dim_t jr_nt = bli_thread_n_way( thread_jr ); \ - const dim_t jr_tid = bli_thread_work_id( thread_jr ); \ + const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \ + const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ @@ -381,15 +320,12 @@ void PASTECH2(bls_,ch,varname) \ /* Assume for now that our next panel of B to be the current panel of B. */ \ ctype* restrict b2 = b_jr; \ -\ - /* Identify the current thrinfo_t node. */ \ - thread_ir = bli_thrinfo_sub_node( thread_jr ); \ \ /* Query the number of threads and thread ids for the IR loop. NOTE: These values are only needed when computing the next micropanel of A. */ \ - const dim_t ir_nt = bli_thread_n_way( thread_ir ); \ - const dim_t ir_tid = bli_thread_work_id( thread_ir ); \ + const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \ + const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \ \ /* Compute number of primary and leftover components of the IR loop. */ \ dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ @@ -446,23 +382,9 @@ void PASTECH2(bls_,ch,varname) \ /* This barrier is needed to prevent threads from starting to pack the next row panel of B before the current row panel is fully computed upon. 
*/ \ - bli_thread_barrier( rntm, thread_pb ); \ + bli_thrinfo_barrier( thread_pb ); \ } \ } \ -\ - /* Release any memory that was acquired for packing matrices A and B. */ \ - PASTECH2(bls_,ch,packm_finalize_mem_a) \ - ( \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTECH2(bls_,ch,packm_finalize_mem_b) \ - ( \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ \ /* PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ diff --git a/frame/thread/bli_l3_sup_decor.c b/sandbox/gemmlike/bls_l3_decor.c similarity index 50% rename from frame/thread/bli_l3_sup_decor.c rename to sandbox/gemmlike/bls_l3_decor.c index 53c7b41be..4bf030586 100644 --- a/frame/thread/bli_l3_sup_decor.c +++ b/sandbox/gemmlike/bls_l3_decor.c @@ -34,51 +34,79 @@ #include "blis.h" -// Initialize a function pointer array containing function addresses for -// each of the threading-specific level-3 sup thread decorators. - -static l3_sup_decor_ft l3_sup_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = +struct l3_sbx_decor_params_s { - [BLIS_SINGLE] = bli_l3_sup_thread_decorator_single, - [BLIS_OPENMP] = -#if defined(BLIS_ENABLE_OPENMP) - bli_l3_sup_thread_decorator_openmp, -#elif defined(BLIS_ENABLE_PTHREADS) - NULL, -#else - NULL, -#endif - [BLIS_POSIX] = -#if defined(BLIS_ENABLE_PTHREADS) - bli_l3_sup_thread_decorator_pthreads, -#elif defined(BLIS_ENABLE_OPENMP) - NULL, -#else - NULL, -#endif + l3sbxint_ft func; + opid_t family; + obj_t* alpha; + obj_t* a; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + rntm_t* rntm; + array_t* array; }; +typedef struct l3_sbx_decor_params_s l3_sbx_decor_params_t; + +static void bls_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const void* data_void ) +{ + const l3_sbx_decor_params_t* data = data_void; + + l3sbxint_ft func = data->func; + opid_t family = data->family; + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* 
cntx = data->cntx; + rntm_t* rntm = data->rntm; + array_t* array = data->array; + + ( void )family; + + // Create the root node of the thread's thrinfo_t structure. + pool_t* sba_pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, sba_pool, rntm ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm, + thread + ); -// Define a dispatcher that chooses a threading-specific function from the -// above function pointer array. + // Free the current thread's thrinfo_t structure. + bli_thrinfo_free( thread ); +} -err_t bli_l3_sup_thread_decorator +void bls_l3_thread_decorator ( - l3supint_ft func, - opid_t family, - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + l3sbxint_ft func, + opid_t family, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm ) { - rntm_t rntm_l; + rntm_t rntm_l = *rntm; // Query the threading implementation and the number of threads requested. - timpl_t ti = bli_rntm_thread_impl( rntm ); - dim_t nt = bli_rntm_num_threads( rntm ); + timpl_t ti = bli_rntm_thread_impl( &rntm_l ); + dim_t nt = bli_rntm_num_threads( &rntm_l ); + + if ( bli_error_checking_is_enabled() ) + bli_l3_thread_decorator_check( &rntm_l ); #ifdef BLIS_ENABLE_NT1_VIA_SINGLE if ( nt == 1 ) @@ -101,37 +129,39 @@ err_t bli_l3_sup_thread_decorator // than one thread. Here, we choose to favor the requested threading // implementation over the number of threads, and so reset all // parallelism parameters to 1. - rntm_l = *rntm; nt = 1; bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); bli_rntm_set_num_threads_only( 1, &rntm_l ); - rntm = &rntm_l; } - // Use the timpl_t value to index into the corresponding function address - // from the function pointer array. - const l3_sup_decor_ft fp = l3_sup_decor_fpa[ ti ]; - - // Call the threading-specific decorator function. 
- return fp - ( - func, - family, - alpha, - a, - b, - beta, - c, - cntx, - rntm - ); -} - -void bli_l3_sup_thread_decorator_check - ( - rntm_t* rntm - ) -{ - bli_l3_sup_thread_decorator_check( rntm ); + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* array = bli_sba_checkout_array( nt ); + + // Declare a params struct and embed within it all of the information + // that is relevant to the computation. + l3_sbx_decor_params_t params; + params.func = func; + params.family = family; + params.alpha = alpha; + params.a = a; + params.b = b; + params.beta = beta; + params.c = c; + params.cntx = cntx; + params.rntm = &rntm_l; + params.array = array; + + // Launch the threads using the threading implementation specified by ti, + // and use bli_l3_thread_decorator_entry() as their entry points. The + // params struct will be passed along to each thread. + bli_thread_launch( ti, nt, bls_l3_thread_decorator_entry, &params ); + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. + bli_sba_checkin_array( array ); } diff --git a/sandbox/gemmlike/thread/bls_l3_decor.h b/sandbox/gemmlike/bls_l3_decor.h similarity index 79% rename from sandbox/gemmlike/thread/bls_l3_decor.h rename to sandbox/gemmlike/bls_l3_decor.h index 58b076270..524c24f38 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor.h +++ b/sandbox/gemmlike/bls_l3_decor.h @@ -48,21 +48,6 @@ typedef void (*l3sbxint_ft) thrinfo_t* thread ); -// Level-3 thread decorator function type. 
-typedef void (*l3sbx_decor_ft) - ( - l3sbxint_ft func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -// Level-3 thread decorator prototype. void bls_l3_thread_decorator ( l3sbxint_ft func, @@ -76,15 +61,5 @@ void bls_l3_thread_decorator rntm_t* rntm ); -void bls_l3_thread_decorator_check - ( - rntm_t* rntm - ); - -// Include definitions specific to the method of multithreading. -#include "bls_l3_decor_single.h" -#include "bls_l3_decor_openmp.h" -#include "bls_l3_decor_pthreads.h" - #endif diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index 9e1f67fc5..412c6c24e 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -61,16 +59,18 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ +\ + mem_t* mem = bli_thrinfo_mem( thread ); \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thrinfo_am_chief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) @@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \ again, I prefer to keep barriers to a minimum.) 
*/ \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thrinfo_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thrinfo_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \ \ if ( mem_size < size_needed ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thrinfo_am_chief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving @@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \ (temporary) mem_t. */ \ bli_pba_release \ ( \ - rntm, \ + bli_thrinfo_pba( thread ), \ mem \ ); \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thrinfo_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) 
*/ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thrinfo_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_a ) GENTFUNC( dcomplex, z, packm_init_mem_a ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ) \ -{ \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. */ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ -} - -//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a ) -GENTFUNC( float, s, packm_finalize_mem_a ) -GENTFUNC( double, d, packm_finalize_mem_a ) -GENTFUNC( scomplex, c, packm_finalize_mem_a ) -GENTFUNC( dcomplex, z, packm_finalize_mem_a ) - - #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ @@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \ ( \ m_alloc, k_alloc, mr, \ cntx, \ - rntm, \ - mem, \ thread \ ); \ \ @@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \ &m_max, &k_max, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ - mem \ + bli_thrinfo_mem( thread ) \ ); \ \ /* Pack matrix A to the destination buffer chosen above. Here, the packed @@ -311,13 +274,13 @@ void PASTECH2(bls_,ch,opname) \ kappa, \ a, rs_a, cs_a, \ *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ + pd_p, *ps_p, \ cntx, \ thread \ ); \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_a ) diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/sandbox/gemmlike/bls_l3_packm_a.h index 201a24efa..2ab53dcbf 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.h +++ b/sandbox/gemmlike/bls_l3_packm_a.h @@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ @@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_a ) GENTPROT( dcomplex, z, packm_init_mem_a ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a ) -GENTPROT( float, s, packm_finalize_mem_a ) -GENTPROT( double, d, packm_finalize_mem_a ) -GENTPROT( scomplex, c, packm_finalize_mem_a ) -GENTPROT( dcomplex, z, packm_finalize_mem_a ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -109,8 +90,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index cb8275fae..cc9757b1d 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -61,16 +59,18 @@ void PASTECH2(bls_,ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ \ /* Compute the size of the memory block eneded. 
*/ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ +\ + mem_t* mem = bli_thrinfo_mem( thread ); \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thrinfo_am_chief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) @@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \ again, I prefer to keep barriers to a minimum.) */ \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thrinfo_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thrinfo_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \ \ if ( mem_size < size_needed ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thrinfo_am_chief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving @@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \ (temporary) mem_t. */ \ bli_pba_release \ ( \ - rntm, \ + bli_thrinfo_pba( thread ), \ mem \ ); \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thrinfo_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. 
*/ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thrinfo_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_b ) GENTFUNC( dcomplex, z, packm_init_mem_b ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ) \ -{ \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. */ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ -} - -//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b ) -GENTFUNC( float, s, packm_finalize_mem_b ) -GENTFUNC( double, d, packm_finalize_mem_b ) -GENTFUNC( scomplex, c, packm_finalize_mem_b ) -GENTFUNC( dcomplex, z, packm_finalize_mem_b ) - - #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ @@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \ ( \ k_alloc, n_alloc, nr, \ cntx, \ - rntm, \ - mem, \ thread \ ); \ \ @@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \ &k_max, &n_max, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ - mem \ + bli_thrinfo_mem( thread ) \ ); \ \ /* Pack matrix B to the destination buffer chosen above. 
Here, the packed @@ -311,13 +274,13 @@ void PASTECH2(bls_,ch,opname) \ kappa, \ b, rs_b, cs_b, \ *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ + pd_p, *ps_p, \ cntx, \ thread \ ); \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( rntm, thread ); \ + bli_thrinfo_barrier( thread ); \ } //INSERT_GENTFUNC_BASIC0( packm_b ) diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/sandbox/gemmlike/bls_l3_packm_b.h index 728d21aed..791cf9b71 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.h +++ b/sandbox/gemmlike/bls_l3_packm_b.h @@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ @@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_b ) GENTPROT( dcomplex, z, packm_init_mem_b ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b ) -GENTPROT( float, s, packm_finalize_mem_b ) -GENTPROT( double, d, packm_finalize_mem_b ) -GENTPROT( scomplex, c, packm_finalize_mem_b ) -GENTPROT( dcomplex, z, packm_finalize_mem_b ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -109,8 +90,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h index 98300536b..4c6db2cac 100644 --- a/sandbox/gemmlike/bls_l3_packm_var.h +++ b/sandbox/gemmlike/bls_l3_packm_var.h @@ -41,7 +41,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c 
index c0649a9ec..e4d566b44 100644 --- a/sandbox/gemmlike/bls_l3_packm_var1.c +++ b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -43,7 +43,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_num_threads( thread ); \ + const dim_t tid = bli_thrinfo_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index 8d2b90cac..3e7e7888a 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -43,7 +43,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. 
*/ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_num_threads( thread ); \ + const dim_t tid = bli_thrinfo_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c index 5ea80ff42..4ccb1828d 100644 --- a/sandbox/gemmlike/bls_l3_packm_var3.c +++ b/sandbox/gemmlike/bls_l3_packm_var3.c @@ -45,7 +45,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -75,11 +75,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thrinfo_num_threads( thread ); \ + const dim_t tid = bli_thrinfo_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/thread/bls_l3_decor.c b/sandbox/gemmlike/thread/bls_l3_decor.c deleted file mode 100644 index 7fa799f14..000000000 --- a/sandbox/gemmlike/thread/bls_l3_decor.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2022, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// Initialize a function pointer array containing function addresses for -// each of the threading-specific level-3 thread decorators. 
- -static l3sbx_decor_ft l3_decor_fpa[ BLIS_NUM_THREAD_IMPLS ] = -{ - [BLIS_SINGLE] = bls_l3_thread_decorator_single, - [BLIS_OPENMP] = -#if defined(BLIS_ENABLE_OPENMP) - bls_l3_thread_decorator_openmp, -#elif defined(BLIS_ENABLE_PTHREADS) - NULL, -#else - NULL, -#endif - [BLIS_POSIX] = -#if defined(BLIS_ENABLE_PTHREADS) - bls_l3_thread_decorator_pthreads, -#elif defined(BLIS_ENABLE_OPENMP) - NULL, -#else - NULL, -#endif -}; - -// Define a dispatcher that chooses a threading-specific function from the -// above function pointer array. - -void bls_l3_thread_decorator - ( - l3sbxint_ft func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - rntm_t rntm_l; - - // Query the threading implementation and the number of threads requested. - timpl_t ti = bli_rntm_thread_impl( rntm ); - dim_t nt = bli_rntm_num_threads( rntm ); - - if ( bli_error_checking_is_enabled() ) - bls_l3_thread_decorator_check( rntm ); - - if ( 1 < nt && ti == BLIS_SINGLE ) - { - // Here, we resolve conflicting information. The caller requested - // a sequential threading implementation, but also requested more - // than one thread. Here, we choose to favor the requested threading - // implementation over the number of threads, and so reset all - // parallelism parameters to 1. - rntm_l = *rntm; - nt = 1; - bli_rntm_set_ways_only( 1, 1, 1, 1, 1, &rntm_l ); - bli_rntm_set_num_threads_only( 1, &rntm_l ); - rntm = &rntm_l; - } - - // Use the timpl_t value to index into the corresponding function address - // from the function pointer array. - const l3sbx_decor_ft fp = l3_decor_fpa[ ti ]; - - // Call the threading-specific decorator function. 
- fp - ( - func, - family, - alpha, - a, - b, - beta, - c, - cntx, - rntm - ); -} - -void bls_l3_thread_decorator_check - ( - rntm_t* rntm - ) -{ - //err_t e_val; - - //e_val = bli_check_valid_thread_impl( bli_rntm_thread_impl( rntm ) ); - //bli_check_error_code( e_val ); - - const timpl_t ti = bli_rntm_thread_impl( rntm ); - - if ( -#ifndef BLIS_ENABLE_OPENMP - ti == BLIS_OPENMP || -#endif -#ifndef BLIS_ENABLE_PTHREADS - ti == BLIS_POSIX || -#endif - FALSE - ) - { - fprintf( stderr, "\n" ); - fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); - fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); - fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); - bli_abort(); - } -} - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c deleted file mode 100644 index 9c29ef27e..000000000 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2021, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_OPENMP - -//#define PRINT_THRINFO - -void bls_l3_thread_decorator_openmp - ( - l3sbxint_ft func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - // Query the total number of threads from the rntm_t object. - const dim_t n_threads = bli_rntm_num_threads( rntm ); - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. 
- bli_pba_rntm_set_pba( rntm ); - - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - - - _Pragma( "omp parallel num_threads(n_threads)" ) - { - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; - - // Query the thread's id from OpenMP. - const dim_t tid = omp_get_thread_num(); - - // Check for a somewhat obscure OpenMP thread-mistmatch issue. - bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); - - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm_p, - thread - ); - - // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - - // Check the array_t back into the small block allocator. Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. 
- bli_sba_checkin_array( array ); -} - -#endif - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h b/sandbox/gemmlike/thread/bls_l3_decor_openmp.h deleted file mode 100644 index 8198a1ba1..000000000 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2021, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_SBX_L3_DECOR_OPENMP_H -#define BLIS_SBX_L3_DECOR_OPENMP_H - -// Definitions specific to situations when OpenMP multithreading is enabled. -#ifdef BLIS_ENABLE_OPENMP - -void bls_l3_thread_decorator_openmp - ( - l3sbxint_ft func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -#endif - -#endif - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c deleted file mode 100644 index 95d0e968e..000000000 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2021, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_PTHREADS - -// A data structure to assist in passing operands to additional threads. -typedef struct thread_data -{ - l3sbxint_ft func; - opid_t family; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - cntx_t* cntx; - rntm_t* rntm; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; -} thread_data_t; - -// Entry point function for additional threads. -void* bls_l3_thread_entry( void* data_void ) -{ - thread_data_t* data = data_void; - - l3sbxint_ft func = data->func; - opid_t family = data->family; - obj_t* alpha = data->alpha; - obj_t* a = data->a; - obj_t* b = data->b; - obj_t* beta = data->beta; - obj_t* c = data->c; - cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - dim_t tid = data->tid; - array_t* array = data->array; - thrcomm_t* gl_comm = data->gl_comm; - - ( void )family; - - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. 
- bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); - - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm_p, - thread - ); - - // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); - - return NULL; -} - -void bls_l3_thread_decorator_pthreads - ( - l3sbxint_ft func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - err_t r_val; - - // Query the total number of threads from the context. - const dim_t n_threads = bli_rntm_num_threads( rntm ); - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - - // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - - // Allocate an array of pthread objects and auxiliary data structs to pass - // to the thread entry functions. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); - - // NOTE: We must iterate backwards so that the chief thread (thread id 0) - // can spawn all other threads before proceeding with its own computation. - for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) - { - // Set up thread data for additional threads (beyond thread 0). - datas[tid].func = func; - datas[tid].family = family; - datas[tid].alpha = alpha; - datas[tid].a = a; - datas[tid].b = b; - datas[tid].beta = beta; - datas[tid].c = c; - datas[tid].cntx = cntx; - datas[tid].rntm = rntm; - datas[tid].tid = tid; - datas[tid].gl_comm = gl_comm; - datas[tid].array = array; - - // Spawn additional threads for ids greater than 1. - if ( tid != 0 ) - bli_pthread_create( &pthreads[tid], NULL, &bls_l3_thread_entry, &datas[tid] ); - else - bls_l3_thread_entry( ( void* )(&datas[0]) ); - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - - // Thread 0 waits for additional threads to finish. - for ( dim_t tid = 1; tid < n_threads; tid++ ) - { - bli_pthread_join( pthreads[tid], NULL ); - } - - // Check the array_t back into the small block allocator. Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. 
- bli_sba_checkin_array( array ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_free_intl( pthreads ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_l3_thread_decorator().pth: " ); - #endif - bli_free_intl( datas ); -} - -#else - -// Define a dummy function bli_l3_thread_entry(), which is needed for -// consistent dynamic linking behavior when building shared objects in Linux -// or OSX, or Windows DLLs; otherwise, we risk having an unresolved symbol. -void* bli_l3_thread_entry( void* data_void ) { return NULL; } - -#endif - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h deleted file mode 100644 index 162086bb0..000000000 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2021, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_SBX_L3_DECOR_PTHREADS_H -#define BLIS_SBX_L3_DECOR_PTHREADS_H - -// Definitions specific to situations when POSIX multithreading is enabled. -#ifdef BLIS_ENABLE_PTHREADS - -// Thread entry point prototype. -void* bls_l3_thread_entry( void* data_void ); - -void bls_l3_thread_decorator_pthreads - ( - l3sbxint_ft func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ); - -#endif - -#endif - diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c deleted file mode 100644 index b5f5a6669..000000000 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2021, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#define SKIP_THRINFO_TREE - -void bls_l3_thread_decorator_single - ( - l3sbxint_ft func, - opid_t family, - //pack_t schema_a, - //pack_t schema_b, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm - ) -{ - // For sequential execution, we use only one thread. - const dim_t n_threads = 1; - - // NOTE: The sba was initialized in bli_init(). - - // Check out an array_t from the small block allocator. This is done - // with an internal lock to ensure only one application thread accesses - // the sba at a time. bli_sba_checkout_array() will also automatically - // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. 
- bli_pba_rntm_set_pba( rntm ); - -#ifndef SKIP_THRINFO_TREE - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif - - - { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - - // There is only one thread id (for the thief thread). - const dim_t tid = 0; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); - -#ifndef SKIP_THRINFO_TREE - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); -#else - // This optimization allows us to use one of the global thrinfo_t - // objects for single-threaded execution rather than grow one from - // scratch. The key is that bli_thrinfo_sup_grow(), which is called - // from within the variants, will immediately return if it detects - // that the thrinfo_t* passed into it is either - // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. - thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; - - ( void )tid; -#endif - - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm_p, - thread - ); - -#ifndef SKIP_THRINFO_TREE - // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); -#endif - } - - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - - // Check the array_t back into the small block allocator. 
Similar to the - // check-out, this is done using a lock embedded within the sba to ensure - // mutual exclusion. - bli_sba_checkin_array( array ); -} - diff --git a/sandbox/old/ref99/old/packm/blx_l3_packm.c b/sandbox/old/ref99/old/packm/blx_l3_packm.c index 982e2d963..dcec1e8cb 100644 --- a/sandbox/old/ref99/old/packm/blx_l3_packm.c +++ b/sandbox/old/ref99/old/packm/blx_l3_packm.c @@ -51,7 +51,7 @@ void blx_l3_packm siz_t size_needed; // FGVZ: Not sure why we need this barrier, but we do. - bli_thread_barrier( thread ); + bli_thrinfo_barrier( thread ); // Every thread initializes x_pack and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t @@ -102,7 +102,7 @@ void blx_l3_packm // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); + local_mem_p = bli_thrinfo_broadcast( thread, &local_mem_s ); // Save the contents of the chief thread's local mem_t entry to the // mem_t field in this thread's control tree node. @@ -142,7 +142,7 @@ void blx_l3_packm // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); + local_mem_p = bli_thrinfo_broadcast( thread, &local_mem_s ); // Save the chief thread's local mem_t entry to the mem_t field in // this thread's control tree node. @@ -155,7 +155,7 @@ void blx_l3_packm // will already have the cached values in their local control // trees' mem_t entries, currently pointed to by cntl_mem_p. - bli_thread_barrier( thread ); + bli_thrinfo_barrier( thread ); } } @@ -178,6 +178,6 @@ void blx_l3_packm ); // Barrier so that packing is done before computation. 
- bli_thread_barrier( thread ); + bli_thrinfo_barrier( thread ); } diff --git a/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c b/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c index 6e8786268..6342f5ab6 100644 --- a/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c +++ b/sandbox/old/ref99/old/vars/blx_gemm_blk_var3.c @@ -73,14 +73,14 @@ void blx_gemm_blk_var3 bli_thrinfo_sub_node( thread ) ); - bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); + bli_thrinfo_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it // only for the first iteration (and then BLIS_ONE for all others). // And since c is a locally aliased obj_t, we can simply overwrite // the internal beta scalar with BLIS_ONE once it has been used in - // the first iteration. + // the first iteration. if ( i == 0 ) bli_obj_scalar_reset( c ); } } diff --git a/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c b/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c index 10c6b81ad..09d5e2c51 100644 --- a/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c +++ b/sandbox/old/ref99/old/vars/blx_gemm_ker_var2.c @@ -265,10 +265,10 @@ void PASTECH2(blx_,ch,varname) \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c index 7cbd402e0..2095f8bd2 100644 --- a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c +++ b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2rr.c @@ -262,10 +262,10 @@ void PASTECH2(blx_,ch,varname) \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c index 2d46886b7..9a1c63a29 100644 --- a/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c +++ b/sandbox/old/ref99/old/vars/other/blx_gemm_ker_var2sl.c @@ -262,10 +262,10 @@ void PASTECH2(blx_,ch,varname) \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ + dim_t jr_nt = bli_thrinfo_n_way( thread ); \ + dim_t jr_tid = bli_thrinfo_work_id( thread ); \ + dim_t ir_nt = bli_thrinfo_n_way( caucus ); \ + dim_t ir_tid = bli_thrinfo_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ diff --git a/sandbox/power10/bli_gemm_ex.c b/sandbox/power10/bli_gemm_ex.c index 3334dc4a5..d136c7e1b 100644 --- a/sandbox/power10/bli_gemm_ex.c +++ b/sandbox/power10/bli_gemm_ex.c @@ -52,7 +52,7 @@ void bli_gemm_ex obj_t* beta, obj_t* c, cntx_t* cntx, - rntm_t* rntm + rntm_t* rntm ) { bli_init_once(); @@ -73,7 +73,7 @@ void bli_gemm_ex // Invoke the operation's front end. bli_gemm_front ( - alpha, a, b, beta, c, cntx, rntm, NULL + alpha, a, b, beta, c, cntx, rntm ); } diff --git a/test/syrk_diagonal/syrk_diagonal_example2.c b/test/syrk_diagonal/syrk_diagonal_example2.c index 92371f48b..710db815c 100644 --- a/test/syrk_diagonal/syrk_diagonal_example2.c +++ b/test/syrk_diagonal/syrk_diagonal_example2.c @@ -226,8 +226,8 @@ void packm_diag /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ - const dim_t nt = bli_thread_n_way( thread ); - const dim_t tid = bli_thread_work_id( thread ); + const dim_t nt = bli_thrinfo_n_way( thread ); + const dim_t tid = bli_thrinfo_work_id( thread ); /* Determine the thread range and increment using the current thread's packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() diff --git a/test/syrk_diagonal/syrk_diagonal_example2.cxx b/test/syrk_diagonal/syrk_diagonal_example2.cxx index 8312a07ee..cc98d97ef 100644 --- a/test/syrk_diagonal/syrk_diagonal_example2.cxx +++ b/test/syrk_diagonal/syrk_diagonal_example2.cxx @@ -210,8 +210,8 @@ void packm_diag /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. 
*/ - const dim_t nt = bli_thread_n_way( thread ); - const dim_t tid = bli_thread_work_id( thread ); + const dim_t nt = bli_thrinfo_n_way( thread ); + const dim_t tid = bli_thrinfo_work_id( thread ); /* Determine the thread range and increment using the current thread's packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() diff --git a/test/tensor_contraction/tcontract_example.cxx b/test/tensor_contraction/tcontract_example.cxx index 0b935c54d..caff3604b 100644 --- a/test/tensor_contraction/tcontract_example.cxx +++ b/test/tensor_contraction/tcontract_example.cxx @@ -431,12 +431,12 @@ void packm_tensor } /* Wait for the scatter vectors to be done. */ - bli_thread_barrier( thread ); + bli_thrinfo_barrier( thread ); /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ - auto nt = bli_thread_n_way( thread ); - auto tid = bli_thread_work_id( thread ); + auto nt = bli_thrinfo_n_way( thread ); + auto tid = bli_thrinfo_work_id( thread ); /* Determine the thread range and increment using the current thread's packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() @@ -656,7 +656,7 @@ void gemm_tensor } /* Wait for the scatter vectors to be done. */ - bli_thread_barrier( thread ); + bli_thrinfo_barrier( thread ); /* Compute number of primary and leftover components of the m and n dimensions. */ @@ -684,10 +684,10 @@ void gemm_tensor auto caucus = bli_thrinfo_sub_node( thread ); /* Query the number of threads and thread ids for each loop. */ - auto jr_nt = bli_thread_n_way( thread ); - auto jr_tid = bli_thread_work_id( thread ); - auto ir_nt = bli_thread_n_way( caucus ); - auto ir_tid = bli_thread_work_id( caucus ); + auto jr_nt = bli_thrinfo_n_way( thread ); + auto jr_tid = bli_thrinfo_work_id( thread ); + auto ir_nt = bli_thrinfo_n_way( caucus ); + auto ir_tid = bli_thrinfo_work_id( caucus ); /* Determine the thread range and increment for the 2nd and 1st loops. 
NOTE: The definition of bli_thread_range_jrir() will depend on whether diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 69ee4339d..f3b5f7b52 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -231,16 +231,12 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - rntm_t rntm; - bli_rntm_init( &rntm ); - bli_pba_rntm_set_pba( &rntm ); - // Transpose B to B^T for packing. bli_obj_induce_trans( &b ); // Create pack objects for a and b, and pack them to ap and bp, // respectively. - cntl_t* cntl_a = libblis_test_pobj_create + thrinfo_t* thread_a = libblis_test_pobj_create ( BLIS_MR, BLIS_KR, @@ -248,10 +244,9 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - &rntm + cntx ); - cntl_t* cntl_b = libblis_test_pobj_create + thrinfo_t* thread_b = libblis_test_pobj_create ( BLIS_NR, BLIS_KR, @@ -259,8 +254,7 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - &rntm + cntx ); // Transpose B^T back to B and Bp^T back to Bp. @@ -293,8 +287,8 @@ void libblis_test_gemm_ukr_experiment // Free the control tree nodes and release their cached mem_t entries // back to the pba. - bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_a ); + bli_thrinfo_free( thread_b ); // Free the test objects. 
bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 44ba51587..480e49c2d 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -283,13 +283,9 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); - rntm_t rntm; - bli_rntm_init( &rntm ); - bli_pba_rntm_set_pba( &rntm ); - // Create pack objects for a and b, and pack them to ap and bp, // respectively. - cntl_t* cntl_a = libblis_test_pobj_create + thrinfo_t* thread_a = libblis_test_pobj_create ( BLIS_MR, BLIS_MR, @@ -297,8 +293,7 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - &rntm + cntx ); // Set the diagonal offset of ap. @@ -315,7 +310,7 @@ bli_printm( "a", &a, "%5.2f", "" ); bli_printm( "ap", &ap, "%5.2f", "" ); #endif - cntl_t* cntl_b = NULL; + thrinfo_t* thread_b = NULL; // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -325,7 +320,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Transpose B to B^T for packing. bli_obj_induce_trans( &b ); - cntl_b = libblis_test_pobj_create + thread_b = libblis_test_pobj_create ( BLIS_NR, BLIS_MR, @@ -333,8 +328,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - &rntm + cntx ); // Transpose B^T back to B and Bp^T back to Bp. @@ -362,9 +356,9 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // to perform the correctness check later. if ( i < n_repeats - 1 ) { - // Free the control tree nodes and release their cached mem_t entries + // Free the thread control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_b ); } } @@ -401,11 +395,11 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Zero out performance and residual if output matrix is empty. 
//libblis_test_check_empty_problem( &c11, perf, resid ); - // Free the control tree nodes and release their cached mem_t entries + // Free the thread control tree nodes and release their cached mem_t entries // back to the pba. - bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - if ( cntl_b ) - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_a ); + if ( thread_b ) + bli_thrinfo_free( thread_b ); // Free the test objects. bli_obj_free( &a_big ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index a355385a3..aec9357ae 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -2652,17 +2652,20 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c } -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ) +thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) { - bool does_inv_diag; + bool does_inv_diag; if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; else does_inv_diag = TRUE; + rntm_t rntm; + bli_rntm_init( &rntm ); + // Create a control tree node for the packing operation. cntl_t* cntl = bli_packm_cntl_create_node ( - NULL, // we don't need the small block allocator from the runtime. + NULL, // pass NULL as the pool so that malloc() is used. NULL, // func ptr is not referenced b/c we don't call via l3 _int(). bmult_id_m, bmult_id_n, @@ -2674,12 +2677,17 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia NULL // no child node needed ); + thrinfo_t* thread = bli_l3_thrinfo_create( 0, &BLIS_SINGLE_COMM, NULL, &rntm, cntl ); + // Pack the contents of A to P. 
- bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( a, p, cntx, cntl, thread ); + + // Free the control tree. + bli_l3_cntl_free( NULL, cntl ); - // Return the control tree pointer so the caller can free the cntl_t and its + // Return the thread control tree pointer so the caller can free the thrinfo_t and its // mem_t entry later on. - return cntl; + return thread; } diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 9e38964ee..93c892c4f 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -420,7 +420,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm ); +thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 5f4988e1c..ae5c9a814 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -232,13 +232,9 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - rntm_t rntm; - bli_rntm_init( &rntm ); - bli_pba_rntm_set_pba( &rntm ); - // Create pack objects for a and b, and pack them to ap and bp, // respectively. 
- cntl_t* cntl_a = libblis_test_pobj_create + thrinfo_t* thread_a = libblis_test_pobj_create ( BLIS_MR, BLIS_MR, @@ -246,8 +242,7 @@ void libblis_test_trsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - &rntm + cntx ); // Set the diagonal offset of ap. @@ -271,7 +266,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Transpose B to B^T for packing. bli_obj_induce_trans( &b ); - cntl_t* cntl_b = libblis_test_pobj_create + thrinfo_t* thread_b = libblis_test_pobj_create ( BLIS_NR, BLIS_MR, @@ -279,8 +274,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - &rntm + cntx ); // Transpose B^T back to B and Bp^T back to Bp. @@ -297,7 +291,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_b ); } // Estimate the performance of the best experiment repeat. @@ -312,7 +306,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_thrinfo_free( thread_a ); // Free the test objects. bli_obj_free( &a ); From 29f79f030e939969d4f3876c4fdaac7b0c5daa63 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 31 Oct 2022 18:57:45 -0500 Subject: [PATCH 099/230] Fixed performance bug caused by redundant packing. (#680) Details: - Fixed a performance bug whereby multiple threads were redundantly packing the same (rather than separate) micropanels. This bug was caused by different parts of the code using the num_threads/thread_id field of the thrinfo_t vs. the n_way/work_id fields. The fix was to standardize on the latter and provide a "fake" thrinfo_t sub-prenode in the thrinfo tree which consists of single-member thread teams. 
The single team with multiple threads node is still required since it and only it can be used to perform barriers and broadcasts (e.g. of the packed buffer pointer). --- frame/1m/packm/bli_packm_blk_var1.c | 38 +++++++++++++++++------------ frame/1m/packm/bli_packm_int.c | 21 +++++++++++----- frame/3/bli_l3_thrinfo.c | 22 ++++++++++++++--- 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index da49126a5..9ac9582db 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -54,11 +54,11 @@ static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); void bli_packm_blk_var1 ( - const obj_t* c, - obj_t* p, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread + const obj_t* c, + obj_t* p, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { // Extract various fields from the control tree. @@ -67,12 +67,18 @@ void bli_packm_blk_var1 bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl ); bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - // Every thread initializes p and determines the size of memory - // block needed (which gets embedded into the otherwise "blank" mem_t - // entry in the control tree node). Return early if no packing is required. - if ( !bli_packm_init( c, p, cntx, cntl, thread ) ) + // Every thread initializes p and determines the size of memory block + // needed (which gets embedded into the otherwise "blank" mem_t entry + // in the control tree node). Return early if no packing is required. + if ( !bli_packm_init( c, p, cntx, cntl, bli_thrinfo_sub_node( thread_par ) ) ) return; + // Use the sub-prenode. In bli_l3_thrinfo_grow(), this node was created to + // represent the team of threads as a group of single-member thread teams. + // This is necessary since all of the work distribution functions depend + // on the work_id and n_way fields.
+ thrinfo_t* thread = bli_thrinfo_sub_prenode( thread_par ); + // Check parameters. if ( bli_error_checking_is_enabled() ) bli_packm_int_check( c, p, cntx ); @@ -134,11 +140,11 @@ void bli_packm_blk_var1 packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; } - /* Compute the total number of iterations we'll need. */ + // Compute the total number of iterations we'll need. dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); - /* Set the initial values and increments for indices related to C and P - based on whether reverse iteration was requested. */ + // Set the initial values and increments for indices related to C and P + // based on whether reverse iteration was requested. dim_t ic0, ip0; doff_t ic_inc, ip_inc; @@ -158,10 +164,10 @@ void bli_packm_blk_var1 ip_inc = 1; } - // Query the number of threads and thread ids from the current thread's - // packm thrinfo_t node. - const dim_t nt = bli_thrinfo_num_threads( thread ); - const dim_t tid = bli_thrinfo_thread_id( thread ); + // Query the number of threads (single-member thread teams) and the thread + // team ids from the current thread's packm thrinfo_t node. + const dim_t nt = bli_thrinfo_n_way( thread ); + const dim_t tid = bli_thrinfo_work_id( thread ); // Determine the thread range and increment using the current thread's // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index fa4fcb47a..49d5a49a3 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -36,10 +36,10 @@ void bli_packm_int ( - const obj_t* a, - obj_t* p, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + obj_t* p, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -53,14 +53,23 @@ void bli_packm_int thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); bli_thrinfo_barrier( thread ); - // Invoke the variant with kappa_use. + // Invoke the packm variant. 
+ // NOTE: The packing kernel uses two communicators: one which represents a + // single workgroup of many threads, and one which represents a group of + // many single-member workgroups. The former communicator is used for + // barriers and thread communication (i.e. broadcasting the pack buffer + // pointer), while the latter communicator is used for partitioning work. + // This is because all of the thread range functions rely on the work_id + // and number of workgroups (n_way). Thus, we pass along the parent + // thrinfo_t node which has these two communicators as the sub-node and + // sub-prenode, respectively. f ( a, p, cntx, cntl, - thread + thread_par ); // Barrier so that packing is done before computation. diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 402497153..0b45abbf6 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -77,7 +77,19 @@ void bli_l3_thrinfo_grow thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par ); bli_thrinfo_set_sub_node( thread_cur, thread_par ); - if ( sub_prenode != NULL ) + if ( bszid == BLIS_NO_PART ) + { + // A hack: the packing code needs a thread communicator which represents + // a group of single-member thread teams working cooperatively However, + // the "normal" packm thrinfo_t node has a single team of multiple + // threads. Our solution (for now) is to create a sub-prenode on the + // thrinfo_t tree which splits this single team into multiple + // single-member thread teams. + const dim_t n_threads = bli_thrinfo_num_threads( thread_par ); + thrinfo_t* thread_pre = bli_thrinfo_split( n_threads, thread_par ); + bli_thrinfo_set_sub_prenode( thread_pre, thread_par ); + } + else if ( sub_prenode != NULL ) { // A pre-node is only used in the IC loop of trsm. 
In this case, // we cannot actually thread in the m dimension due to data dependencies @@ -88,8 +100,12 @@ void bli_l3_thrinfo_grow bli_rntm_set_ic_ways_only( 1, &rntm_l ); bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l ); - // Use thread_pre instead of thread_cur since we *don't* want to - // do any parallelism at this level. + // Use thread_pre instead of thread_cur since we *don't* want to do any + // parallelism at this level. So the thread_pre node gets attached to + // thread_par and not thread_cur! This results in a split "one level + // higher" than in the corresponding cntl_t tree. This is intentional + // since two different thrinfo_t nodes will be used at the cntl_t node + // for trsm blocked variant 1 (one for trsm, one for gemm). thrinfo_t* thread_pre = bli_thrinfo_split( 1, thread_par ); bli_thrinfo_set_sub_prenode( thread_pre, thread_par ); bli_l3_thrinfo_grow( thread_pre, &rntm_l, sub_prenode ); From 5eea6ad9eb25f37685d1ae4ae08c73cd1daca297 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 2 Nov 2022 17:07:54 -0500 Subject: [PATCH 100/230] Add mention of Wilkinson Prize to README.md. (#683) Details: - Added blurbs and links to Wilkinson Prize to README.md. - Added mention of both Best Paper and Wilkinson Prizes to the top of README.md. - Other minor tweaks. --- README.md | 66 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 012861366..e0e4238ca 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +_Recipient of the **[2023 James H. 
Wilkinson Prize for Numerical Software](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software)**_ + +_Recipient of the **[2020 SIAM Activity Group on Supercomputing Best Paper Prize](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize)**_ + + ![The BLIS cat is sleeping.](http://www.cs.utexas.edu/users/field/blis_cat.png) [![Build Status](https://api.travis-ci.com/flame/blis.svg?branch=master)](https://app.travis-ci.com/github/flame/blis) @@ -22,12 +27,14 @@ Contents * **[Discussion](#discussion)** * **[Contributing](#contributing)** * **[Citations](#citations)** +* **[Awards](#awards)** * **[Funding](#funding)** Introduction ------------ -BLIS is a portable software framework for instantiating high-performance +BLIS is an [award-winning](#awards) +portable software framework for instantiating high-performance BLAS-like dense linear algebra libraries. The framework was designed to isolate essential kernels of computation that, when optimized, immediately enable optimized implementations of most of its commonly used and computationally @@ -99,16 +106,30 @@ all of which are available for free via the [edX platform](http://www.edx.org/). What's New ---------- + * **BLIS selected for the 2023 James H. Wilkinson Prize for Numerical Software!** We +are thrilled to announce that Field Van Zee and Devin Matthews were chosen to receive +the [2023 James H. Wilkinson Prize for Numerical Software](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software). +The selection committee sought to recognize the recipients "for the development of +BLIS, a portable open-source software framework that facilitates rapid instantiation +of high-performance BLAS and BLAS-like operations targeting modern CPUs." 
This prize +is awarded once every four years to the authors of an outstanding piece of numerical +software, or to individuals who have made an outstanding contribution to an existing +piece of numerical software. It is awarded to an entry that best addresses all phases +of the preparation of high-quality numerical software, and is intended to recognize +innovative software in scientific computing and to encourage researchers in the +earlier stages of their career. The prize will be awarded at the +[2023 SIAM Conference on Computational Science and Engineering](https://www.siam.org/conferences/cm/conference/cse23) in Amsterdam. + * **Join us on Discord!** In 2021, we soft-launched our [Discord](https://discord.com/) server by privately inviting current and former collaborators, attendees of our BLIS -Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled by -the results thus far, and are happy to announce that our new community is now open to -the broader public! If you'd like to hang out with other BLIS users and developers, -ask a question, discuss future features, or just say hello, please feel free to join us! -We've put together a [step-by-step guide](docs/Discord.md) for creating an account and -joining our cozy enclave. We even have a monthly "BLIS happy hour" event where people -can casually come together for a video chat, Q&A, brainstorm session, or whatever it -happens to unfold into! +Retreat, as well as other participants within the BLIS ecosystem. We've been thrilled +by the results thus far, and are happy to announce that our new community is now open +to the broader public! If you'd like to hang out with other BLIS users and developers, +ask a question, discuss future features, or just say hello, please feel free to join +us! We've put together a [step-by-step guide](docs/Discord.md) for creating an account +and joining our cozy enclave. 
We even have a monthly "BLIS happy hour" event where +people can casually come together for a video chat, Q&A, brainstorm session, or +whatever it happens to unfold into! * **Addons feature now available!** Have you ever wanted to quickly extend BLIS's operation support or define new custom BLIS APIs for your application, but were @@ -622,10 +643,10 @@ releases. The source packages may build on other rpm-based distributions. the source rpms may build for others. * **GNU Guix**. Guix has BLIS packages, provides builds only for the generic -target and some specific x86_64 micro-architectures. +target and some specific `x86_64` micro-architectures. * **Conda**. conda channel [conda-forge](https://github.com/conda-forge/blis-feedstock) -has Linux, OSX and Windows binary packages for x86_64. +has Linux, OSX and Windows binary packages for `x86_64`. Discussion ---------- @@ -795,6 +816,29 @@ within the BLIS Framework}, } ``` +Awards +------ + + * **[2023 James H. Wilkinson Prize for Numerical Software.](https://www.siam.org/prizes-recognition/major-prizes-lectures/detail/james-h-wilkinson-prize-for-numerical-software)** +This prize is awarded once every four years to the authors of an outstanding piece of +numerical software, or to individuals who have made an outstanding contribution to an +existing piece of numerical software. The selection committee sought to recognize the +recipients "for the development of [BLIS](https://github.com/flame/blis), a portable +open-source software framework that facilitates rapid instantiation of +high-performance BLAS and BLAS-like operations targeting modern CPUs." The prize will +be awarded at the +[2023 SIAM Conference on Computational Science and Engineering](https://www.siam.org/conferences/cm/conference/cse23) in Amsterdam. 
+ + * **[2020 SIAM Activity Group on Supercomputing Best Paper Prize.](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize)** +This prize is awarded once every two years to the authors of the most outstanding +paper, as determined by the selection committee, in the field of parallel scientific +and engineering computing published within the four calendar years preceding the +award year. The prize was chosen for the paper ["The BLIS Framework: Experiments in +Portability."](#citations) and awarded at the [2020 SIAM Conference on Parallel Processing for Scientific Computing](https://www.siam.org/conferences/cm/conference/pp20) in Seattle where Robert van de Geijn delivered [a talk on BLIS](https://meetings.siam.org/sess/dsp_programsess.cfm?SESSIONCODE=68266) and accepted the prize alongside other coauthors. +See also: + * [SIAM News | January 2020 Prize Spotlight](https://sinews.siam.org/Details-Page/january-2020-prize-spotlight#Field&Robert) + * [Oden Institute's SHPC Group Win SIAM Best Paper Prize](https://www.oden.utexas.edu/about/news/ScienceHighPerfomanceComputingSIAMBestPaperPrize/) + Funding ------- From edcc2f9940449f7d9cefcfc02159d27b013e7995 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 2 Nov 2022 19:04:49 -0500 Subject: [PATCH 101/230] Support --nosup, --sup configure options. (#684) Details: - Added --nosup and --sup as alternative ways of requesting that sup be disabled or enabled. These are analogous to --disable-sup-handling and --enable-sup-handling, respectively. (I got tired of typing out --disable-sup-handling and needed a shorthand notation.) - Tweaked message output by configure when sup is enabled/disabled for clarity and specificity. - Whitespace changes. 
--- configure | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 37399fbde..49deec819 100755 --- a/configure +++ b/configure @@ -2570,12 +2570,15 @@ main() case $opt in -) case "$OPTARG" in + help) print_usage ;; + quiet) quiet_flag=1 ;; + prefix=*) prefix_flag=1 prefix=${OPTARG#*=} @@ -2596,6 +2599,7 @@ main() sharedir_flag=1 sharedir=${OPTARG#*=} ;; + enable-debug) debug_flag=1 debug_type=noopt @@ -2607,78 +2611,92 @@ main() disable-debug) debug_flag=0 ;; + enable-asan) enable_asan='yes' ;; disable-asan) enable_asan='no' ;; + enable-verbose-make) enable_verbose='yes' ;; disable-verbose-make) enable_verbose='no' ;; + enable-arg-max-hack) enable_arg_max_hack='yes' ;; disable-arg-max-hack) enable_arg_max_hack='no' ;; + enable-static) enable_static='yes' ;; disable-static) enable_static='no' ;; + enable-shared) enable_shared='yes' ;; disable-shared) enable_shared='no' ;; + enable-rpath) enable_rpath='yes' ;; disable-rpath) enable_rpath='no' ;; + export-shared=*) export_shared=${OPTARG#*=} ;; + enable-system) enable_system='yes' ;; disable-system) enable_system='no' ;; + enable-threading=*) threading_model=${OPTARG#*=} ;; disable-threading) threading_model='single' ;; + thread-part-jrir=*) thread_part_jrir=${OPTARG#*=} ;; + enable-pba-pools) enable_pba_pools='yes' ;; disable-pba-pools) enable_pba_pools='no' ;; + enable-sba-pools) enable_sba_pools='yes' ;; disable-sba-pools) enable_sba_pools='no' ;; + enable-mem-tracing) enable_mem_tracing='yes' ;; disable-mem-tracing) enable_mem_tracing='no' ;; + enable-addon=*) addon_flag=1 addon_name=${OPTARG#*=} @@ -2688,6 +2706,7 @@ main() disable-addon) addon_flag='' ;; + enable-sandbox=*) sandbox_flag=1 sandbox=${OPTARG#*=} @@ -2695,69 +2714,89 @@ main() disable-sandbox) sandbox_flag='' ;; + int-size=*) int_type_size=${OPTARG#*=} ;; + blas-int-size=*) blas_int_type_size=${OPTARG#*=} ;; + enable-blas) enable_blas='yes' ;; disable-blas) 
enable_blas='no' ;; + enable-cblas) enable_cblas='yes' ;; disable-cblas) enable_cblas='no' ;; + enable-mixed-dt) enable_mixed_dt='yes' ;; disable-mixed-dt) enable_mixed_dt='no' ;; + enable-mixed-dt-extra-mem) enable_mixed_dt_extra_mem='yes' ;; disable-mixed-dt-extra-mem) enable_mixed_dt_extra_mem='no' ;; + + sup) + enable_sup_handling='yes' + ;; enable-sup-handling) enable_sup_handling='yes' ;; + nosup) + enable_sup_handling='no' + ;; disable-sup-handling) enable_sup_handling='no' ;; + enable-amd-frame-tweaks) enable_amd_frame_tweaks='yes' ;; disable-amd-frame-tweaks) enable_amd_frame_tweaks='no' ;; + with-memkind) enable_memkind='yes' ;; without-memkind) enable_memkind='no' ;; + enable-trsm-preinversion) enable_trsm_preinversion='yes' ;; disable-trsm-preinversion) enable_trsm_preinversion='no' ;; + force-version=*) force_version=${OPTARG#*=} ;; + show-config-list) show_config_list=1 ;; + complex-return=*) complex_return=${OPTARG#*=} ;; + *) print_usage ;; @@ -3764,10 +3803,10 @@ main() enable_mixed_dt_01=0 fi if [ "x${enable_sup_handling}" = "xyes" ]; then - echo "${script_name}: small matrix handling is enabled." + echo "${script_name}: sup (skinny/unpacked) matrix handling is enabled." enable_sup_handling_01=1 else - echo "${script_name}: small matrix handling is disabled." + echo "${script_name}: sup (skinny/unpacked) matrix handling is disabled." enable_sup_handling_01=0 fi if [ "x${enable_trsm_preinversion}" = "xyes" ]; then From 872898d817f35702e7678ff7f3eeff0f12e641f5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 2 Nov 2022 21:53:22 -0500 Subject: [PATCH 102/230] Fixed trmm[3]/trsm performance bug in cf7d616. (#685) Details: - Fixed a performance bug in the packing of micropanels that intersect the diagonal of triangular matrices (i.e., those found in trmm, trmm3, and trsm). This bug was introduced in cf7d616 and stemmed from an ill-formed boolean conditional expression in bli_packm_blk_var1(). 
This conditional would choose when to use round-robin parallel work allocation, but checked for the triangularity of the submatrix being packed while failing to also check whether the current micropanel actually intersected the diagonal. The net result of this bug was that *all* micropanels of a triangular matrix, no matter where the upanels resided within the matrix, were assigned to threads via a round-robin policy. This affected some microarchitectures and threading configurations much worse than others, but it seems that overall the effect was universally negative, likely because of the reduced spatial locality during the packing with round-robin. Thanks to Leick Robinson for his tireless efforts in helping track down this issue. --- frame/1m/packm/bli_packm_blk_var1.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 9ac9582db..05263c4b7 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -190,15 +190,16 @@ void bli_packm_blk_var1 inc_t p_inc = ps_p; - // NOTE: We MUST use round-robin partitioning when packing - // micropanels of a triangular matrix. Hermitian/symmetric - // and general packing may use slab or round-robin, depending - // on which was selected at configure-time. - // The definition of bli_packm_my_iter() will depend on whether slab - // or round-robin partitioning was requested at configure-time. - bool my_iter = bli_is_triangular( strucc ) - ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) - : bli_packm_my_iter ( it, it_start, it_end, tid, nt ); + // NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr()) + // when packing micropanels of a triangular matrix. Hermitian/symmetric + // and general packing may use slab or round-robin (bli_packm_my_iter()), + // depending on which was selected at configure-time. 
+ bool my_iter = ( bli_is_triangular( strucc ) && + bli_intersects_diag_n( diagoffc_i, panel_dim_i, + panel_len_full ) + ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) + : bli_packm_my_iter ( it, it_start, it_end, tid, nt ) + ); if ( bli_is_triangular( strucc ) && bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) From 6774bf08c92fc6983706a91bbb93b960e8eef285 Mon Sep 17 00:00:00 2001 From: Lee Killough <15950023+leekillough@users.noreply.github.com> Date: Thu, 3 Nov 2022 15:20:47 -0500 Subject: [PATCH 103/230] Fix typo in configure --help text. (#686) Details: - Fixed a misspelling in the --help description for the --int-size (-i) configure option. --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index 49deec819..5bfa608cd 100755 --- a/configure +++ b/configure @@ -244,7 +244,7 @@ print_usage() echo " " echo " Set the size (in bits) of internal BLIS integers and" echo " integer types used in native BLIS interfaces. The" - echo " default inteter type size is architecture dependent." + echo " default integer type size is architecture dependent." echo " (Hint: You can always find this value printed at the" echo " beginning of the testsuite output.)" echo " " From 8d813f7f12732d52c95570ae884d5defbfd19234 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 3 Nov 2022 19:10:47 -0500 Subject: [PATCH 104/230] Some decluttering of the top-level directory. Details: - Relocated 'mpi_test' directory to test/mpi_test. - Relocated 'so_version' and 'version' files from top-level directory to 'build' directory. - Updated build/bump-version.sh script to accommodate relocation of 'version' file to 'build' directory. - Updated configure script to accommodate relocation of 'so_version' file to 'build' directory. - Updated INSTALL file to replace pointers to blis-devel mailing list with a pointer to docs/Discord.md. 
- Updated RELEASING file to contain a reminder to consider whether the so_version file should be updated prior to the release. --- INSTALL | 11 +++++++---- RELEASING | 22 +++++++++++++--------- build/bump-version.sh | 6 +++--- so_version => build/so_version | 0 version => build/version | 0 configure | 10 +++++----- {mpi_test => test/mpi_test}/Makefile | 0 {mpi_test => test/mpi_test}/test_gemm.c | 0 {mpi_test => test/mpi_test}/test_hemm.c | 0 {mpi_test => test/mpi_test}/test_her2k.c | 0 {mpi_test => test/mpi_test}/test_herk.c | 0 {mpi_test => test/mpi_test}/test_trmm.c | 0 {mpi_test => test/mpi_test}/test_trsm.c | 0 13 files changed, 28 insertions(+), 21 deletions(-) rename so_version => build/so_version (100%) rename version => build/version (100%) rename {mpi_test => test/mpi_test}/Makefile (100%) rename {mpi_test => test/mpi_test}/test_gemm.c (100%) rename {mpi_test => test/mpi_test}/test_hemm.c (100%) rename {mpi_test => test/mpi_test}/test_her2k.c (100%) rename {mpi_test => test/mpi_test}/test_herk.c (100%) rename {mpi_test => test/mpi_test}/test_trmm.c (100%) rename {mpi_test => test/mpi_test}/test_trsm.c (100%) diff --git a/INSTALL b/INSTALL index 9adc43867..75850a96b 100644 --- a/INSTALL +++ b/INSTALL @@ -17,11 +17,14 @@ viewing the file over GitHub via a web browser: This document will always contain the most up-to-date information related to instantiating a BLIS library from the framework source code. If you have any further questions or wish to provide feedback, please contact the BLIS -community by posting your message to the BLIS developer's mailing list: +community by either by joining our Discord community! Instructions for +joining may be found in: - https://groups.google.com/d/forum/blis-devel + docs/Discord.md -Thanks for your interest in the BLIS framework! +or in rendered form at: + + https://github.com/flame/blis/blob/master/docs/Discord.md -Field Van Zee +Thanks for your interest in the BLIS framework! 
diff --git a/RELEASING b/RELEASING index 351594c49..0996a560d 100644 --- a/RELEASING +++ b/RELEASING @@ -7,38 +7,42 @@ Here are the steps to follow to create a new release (version) of BLIS: If there are any commits upstream, merge them as appropriate. -2. Verify that the code builds properly. +2. Consider whether the so_version should be updated (via the so_version + file in the 'build' directory) due to any ABI changes since the previous + version. If so, commit that change now. + +3. Verify that the code builds properly. $ ./configure auto; make -3. Verify that the code passes BLIS and BLAS tests: +4. Verify that the code passes BLIS and BLAS tests: $ make check # BLIS testsuite (fast) + BLAS test drivers $ make checkblis # BLIS testsuite (full ex. mixed-datatype) $ make checkblis-md # BLIS testsuite (mixed-datatype only) $ make checkblis-salt # BLIS testsuite (fast + salt) -4. Draft a new announcement to blis-devel, crediting those who +5. Draft a new announcement to blis-devel, crediting those who contributed towards this version by browsing 'git log'. -5. Update CREDITS file if 'git log' reveals any new contributors. +6. Update CREDITS file if 'git log' reveals any new contributors. -6. Update docs/ReleaseNotes.md file with body of finalized announcement +7. Update docs/ReleaseNotes.md file with body of finalized announcement and the date of the release. -7. Commit changes from steps 5 and 6. +8. Commit changes from steps 5 and 6. -8. Bump the version number: +9. Bump the version number: $ ./build/bump-version.sh "0.3.2" This will result in two new commits: a version file update and a CHANGELOG file update. -9. Push the new commits and new tag associated with the new version: +10. Push the new commits and new tag associated with the new version: $ git push $ git push --tag -10. Send finalized announcement to blis-devel. +11. Send finalized announcement to blis-devel. 
diff --git a/build/bump-version.sh b/build/bump-version.sh index 65e1a2988..b72a09a40 100755 --- a/build/bump-version.sh +++ b/build/bump-version.sh @@ -98,10 +98,10 @@ main() # The name of the CHANGELOG file. changelog_file='CHANGELOG' - # The name of the default version file. - version_file_def='version' + # The name and location of the default version file. + version_file_def='build/version' - # The name of the specified version file. + # The name and location of the specified version file. version_file='' # Strings used during version query. diff --git a/so_version b/build/so_version similarity index 100% rename from so_version rename to build/so_version diff --git a/version b/build/version similarity index 100% rename from version rename to build/version diff --git a/configure b/configure index 5bfa608cd..fd4812b1b 100755 --- a/configure +++ b/configure @@ -2346,10 +2346,6 @@ main() # of the distribution and the directory in which we are building. cur_dirpath="." - # The file in which the version string is kept. - version_file="version" - version_filepath="${dist_path}/${version_file}" - # The name of and path to the directory named "build" in the top-level # directory of the source distribution. build_dir='build' @@ -2431,9 +2427,13 @@ main() # -- Version-related -- + # The file in which the version string is kept. + version_file="version" + version_filepath="${build_dirpath}/${version_file}" + # The shared library (.so) version file. so_version_file='so_version' - so_version_filepath="${dist_path}/${so_version_file}" + so_version_filepath="${build_dirpath}/${so_version_file}" # The major and minor/build .so version numbers. 
so_version_major='' diff --git a/mpi_test/Makefile b/test/mpi_test/Makefile similarity index 100% rename from mpi_test/Makefile rename to test/mpi_test/Makefile diff --git a/mpi_test/test_gemm.c b/test/mpi_test/test_gemm.c similarity index 100% rename from mpi_test/test_gemm.c rename to test/mpi_test/test_gemm.c diff --git a/mpi_test/test_hemm.c b/test/mpi_test/test_hemm.c similarity index 100% rename from mpi_test/test_hemm.c rename to test/mpi_test/test_hemm.c diff --git a/mpi_test/test_her2k.c b/test/mpi_test/test_her2k.c similarity index 100% rename from mpi_test/test_her2k.c rename to test/mpi_test/test_her2k.c diff --git a/mpi_test/test_herk.c b/test/mpi_test/test_herk.c similarity index 100% rename from mpi_test/test_herk.c rename to test/mpi_test/test_herk.c diff --git a/mpi_test/test_trmm.c b/test/mpi_test/test_trmm.c similarity index 100% rename from mpi_test/test_trmm.c rename to test/mpi_test/test_trmm.c diff --git a/mpi_test/test_trsm.c b/test/mpi_test/test_trsm.c similarity index 100% rename from mpi_test/test_trsm.c rename to test/mpi_test/test_trsm.c From 713d078075a4a563a43d83fd0880ab5091c2e4a4 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 3 Nov 2022 20:00:11 -0500 Subject: [PATCH 105/230] Delete mpi_test garbage. (#689) Details: - tlrmchlsmth: "What even is this? No comments, no commit message, not used by anything. Trash." 
--- test/mpi_test/Makefile | 283 ------------------------------------- test/mpi_test/test_gemm.c | 232 ------------------------------ test/mpi_test/test_hemm.c | 252 --------------------------------- test/mpi_test/test_her2k.c | 209 --------------------------- test/mpi_test/test_herk.c | 200 -------------------------- test/mpi_test/test_trmm.c | 246 -------------------------------- test/mpi_test/test_trsm.c | 282 ------------------------------------ 7 files changed, 1704 deletions(-) delete mode 100644 test/mpi_test/Makefile delete mode 100644 test/mpi_test/test_gemm.c delete mode 100644 test/mpi_test/test_hemm.c delete mode 100644 test/mpi_test/test_her2k.c delete mode 100644 test/mpi_test/test_herk.c delete mode 100644 test/mpi_test/test_trmm.c delete mode 100644 test/mpi_test/test_trsm.c diff --git a/test/mpi_test/Makefile b/test/mpi_test/Makefile deleted file mode 100644 index 00ca01e47..000000000 --- a/test/mpi_test/Makefile +++ /dev/null @@ -1,283 +0,0 @@ -# -# -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# - Neither the name(s) of the copyright holder(s) nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# - -# -# Makefile -# -# Field G. Van Zee -# -# Makefile for standalone BLIS test drivers. -# - -# -# --- Makefile PHONY target definitions ---------------------------------------- -# - -.PHONY: all \ - blis essl \ - clean cleanx - - - -# Comments: -# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. -# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in -# the second case because CONFIG_NAME is not yet set. -ifneq ($(strip $(BLIS_INSTALL_PATH)),) -LIB_PATH := $(BLIS_INSTALL_PATH)/lib -INC_PATH := $(BLIS_INSTALL_PATH)/include/blis -SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis -else -DIST_PATH := .. -LIB_PATH = ../lib/$(CONFIG_NAME) -INC_PATH = ../include/$(CONFIG_NAME) -SHARE_PATH := .. -endif - - - -# -# --- Include common makefile definitions -------------------------------------- -# - -# Include the common makefile fragment. --include $(SHARE_PATH)/common.mk - - - -# -# --- BLAS and LAPACK implementations ------------------------------------------ -# - -# BLAS library path(s). This is where the BLAS libraries reside. 
-BLAS_LIB_PATH := $(HOME)/flame/lib -MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64/ -ESSL_LIB_PATH := /soft/libraries/essl/current/lib64 - -# OpenBLAS -OPENBLAS_LIB := $(BLAS_LIB_PATH)/libopenblas.a - -# ATLAS -ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \ - $(BLAS_LIB_PATH)/libatlas.a - -# MKL -MKL_LIB := -L$(MKL_LIB_PATH) \ - -lmkl_sequential \ - -lmkl_core \ - -lmkl_intel_lp64 - -# ESSL -# Note: ESSL is named differently for SMP and/or BG -ESSL_LIB := $(ESSL_LIB_PATH)/libesslsmpbg.a \ - -L$(IBM_MAIN_DIR)/xlsmp/bg/3.1/bglib64/ \ - -L$(IBM_MAIN_DIR)/xlf/bg/14.1/bglib64/ \ - -lxlsmp -lxlf90_r -lxlfmath -lxl - -# Accelerate -MAC_LIB := -framework Accelerate - - - -# -# --- General build definitions ------------------------------------------------ -# - -TEST_SRC_PATH := . -TEST_OBJ_PATH := . - -# Gather all local object files. -TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \ - $(TEST_OBJ_PATH)/%.o, \ - $(wildcard $(TEST_SRC_PATH)/*.c)) - -# Override the value of CINCFLAGS so that the value of CFLAGS returned by -# get-user-cflags-for() is not cluttered up with include paths needed only -# while building BLIS. -CINCFLAGS := -I$(INC_PATH) - -# Use the "framework" CFLAGS for the configuration family. -CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME)) - -# Add local header paths to CFLAGS -CFLAGS += -I$(TEST_SRC_PATH) - -# Locate the libblis library to which we will link. 
-#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L) - - - -# -# --- Targets/rules ------------------------------------------------------------ -# - -# Complete list of possible targets when defining 'all': -# -# blis openblas atlas mkl mac essl -# -all: blis essl - -blis: test_gemm_blis.x \ - test_hemm_blis.x \ - test_herk_blis.x \ - test_her2k_blis.x \ - test_trmm_blis.x \ - test_trsm_blis.x - -essl: test_gemm_essl.x \ - test_hemm_essl.x \ - test_herk_essl.x \ - test_her2k_essl.x \ - test_trmm_essl.x \ - test_trsm_essl.x - -openblas: test_gemv_openblas.x \ - test_ger_openblas.x \ - test_hemv_openblas.x \ - test_her_openblas.x \ - test_her2_openblas.x \ - test_trmv_openblas.x \ - test_trsv_openblas.x \ - \ - test_gemm_openblas.x \ - test_hemm_openblas.x \ - test_herk_openblas.x \ - test_her2k_openblas.x \ - test_trmm_openblas.x \ - test_trsm_openblas.x - -atlas: test_gemv_atlas.x \ - test_ger_atlas.x \ - test_hemv_atlas.x \ - test_her_atlas.x \ - test_her2_atlas.x \ - test_trmv_atlas.x \ - test_trsv_atlas.x \ - \ - test_gemm_atlas.x \ - test_hemm_atlas.x \ - test_herk_atlas.x \ - test_her2k_atlas.x \ - test_trmm_atlas.x \ - test_trsm_atlas.x - -mkl: test_gemv_mkl.x \ - test_ger_mkl.x \ - test_hemv_mkl.x \ - test_her_mkl.x \ - test_her2_mkl.x \ - test_trmv_mkl.x \ - test_trsv_mkl.x \ - \ - test_gemm_mkl.x \ - test_hemm_mkl.x \ - test_herk_mkl.x \ - test_her2k_mkl.x \ - test_trmm_mkl.x \ - test_trsm_mkl.x - -mac: test_gemv_mac.x \ - test_ger_mac.x \ - test_hemv_mac.x \ - test_her_mac.x \ - test_her2_mac.x \ - test_trmv_mac.x \ - test_trsv_mac.x \ - \ - test_gemm_mac.x \ - test_hemm_mac.x \ - test_herk_mac.x \ - test_her2k_mac.x \ - test_trmm_mac.x \ - test_trsm_mac.x - - - -# --Object file rules -- - -$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c - $(CC) $(CFLAGS) -c $< -o $@ - -test_%_openblas.o: test_%.c - $(CC) $(CFLAGS) -DBLAS=\"openblas\" -c $< -o $@ - -test_%_atlas.o: test_%.c - $(CC) $(CFLAGS) -DBLAS=\"atlas\" -c $< -o $@ - -test_%_mkl.o: test_%.c - $(CC) $(CFLAGS) 
-DBLAS=\"mkl\" -c $< -o $@ - -test_%_essl.o: test_%.c - $(CC) $(CFLAGS) -DBLAS=\"essl\" -c $< -o $@ - -test_%_mac.o: test_%.c - $(CC) $(CFLAGS) -DBLAS=\"mac\" -c $< -o $@ - -test_%_blis.o: test_%.c - $(CC) $(CFLAGS) -DBLIS -c $< -o $@ - - -# -- Executable file rules -- - -# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS -# on the link command line in case BLIS was configured with the BLAS -# compatibility layer. This prevents BLIS from inadvertently getting called -# for the BLAS routines we are trying to test with. - -test_%_openblas.x: test_%_openblas.o $(LIBBLIS_LINK) - $(LINKER) $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - -test_%_atlas.x: test_%_atlas.o $(LIBBLIS_LINK) - $(LINKER) $< $(ATLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - -test_%_mkl.x: test_%_mkl.o $(LIBBLIS_LINK) - $(LINKER) $< $(MKL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - -test_%_essl.x: test_%_essl.o $(LIBBLIS_LINK) - $(LINKER) $< $(ESSL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - -test_%_mac.x: test_%_mac.o $(LIBBLIS_LINK) - $(LINKER) $< $(MAC_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - -test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK) - $(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - - -# -- Clean rules -- - -clean: cleanx - -cleanx: - - $(RM_F) *.o *.x - diff --git a/test/mpi_test/test_gemm.c b/test/mpi_test/test_gemm.c deleted file mode 100644 index 8c5c58c23..000000000 --- a/test/mpi_test/test_gemm.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include -#include "blis.h" -#include - -// transa transb m n k alpha a lda b ldb beta c ldc -//void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); - -//#define PRINT - -int main( int argc, char** argv ) -{ - obj_t a, b, c; - obj_t c_save; - obj_t alpha, beta; - dim_t m, n, k; - dim_t p; - dim_t p_begin, p_end, p_inc; - int m_input, n_input, k_input; - num_t dt_a, dt_b, dt_c; - num_t dt_alpha, dt_beta; - int r, n_repeats; - - double dtime; - double dtime_save; - double gflops; - - bli_init(); - - n_repeats = 3; - - if( argc < 7 ) - { - printf("Usage:\n"); - printf("test_foo.x m n k p_begin p_inc p_end:\n"); - exit; - } - - int world_size, world_rank, provided; - MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided ); - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); - - m_input = strtol( argv[1], NULL, 10 ); - n_input = strtol( argv[2], NULL, 10 ); - k_input = strtol( argv[3], NULL, 10 ); - p_begin = strtol( argv[4], NULL, 10 ); - p_inc = strtol( argv[5], NULL, 10 ); - p_end = strtol( argv[6], NULL, 10 ); - -#if 1 - dt_a = BLIS_DOUBLE; - dt_b = BLIS_DOUBLE; - dt_c = BLIS_DOUBLE; - dt_alpha = BLIS_DOUBLE; - dt_beta = BLIS_DOUBLE; -#else - dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX; -#endif - - for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size ) - { - - if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); - else m = ( dim_t ) m_input; - if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); - else n = ( dim_t ) n_input; - if ( k_input < 0 ) k = p * ( dim_t )abs(k_input); - else k = ( dim_t ) k_input; - - - bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); - - bli_obj_create( dt_a, m, k, 0, 0, &a ); - bli_obj_create( dt_b, k, n, 0, 0, &b ); - bli_obj_create( dt_c, m, n, 0, 0, &c ); - bli_obj_create( dt_c, m, n, 0, 0, &c_save ); - - bli_randm( &a ); - bli_randm( &b ); - 
bli_randm( &c ); - - - bli_setsc( (0.9/1.0), 0.2, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); - - - bli_copym( &c, &c_save ); - - dtime_save = 1.0e9; - - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &c_save, &c ); - - - dtime = bli_clock(); - -#ifdef BLIS - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - bli_gemm( &alpha, - //bli_gemm4m( &alpha, - &a, - &b, - &beta, - &c ); - -#else - if ( bli_is_real( dt_a ) ) - { - f77_char transa = 'N'; - f77_char transb = 'N'; - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - double* ap = bli_obj_buffer( &a ); - double* bp = bli_obj_buffer( &b ); - double* betap = bli_obj_buffer( &beta ); - double* cp = bli_obj_buffer( &c ); - - dgemm_( &transa, - &transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); - } - else - { - f77_char transa = 'N'; - f77_char transb = 'N'; - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - dcomplex* alphap = bli_obj_buffer( &alpha ); - dcomplex* ap = bli_obj_buffer( &a ); - dcomplex* bp = bli_obj_buffer( &b ); - dcomplex* betap = bli_obj_buffer( &beta ); - dcomplex* cp = bli_obj_buffer( &c ); - - zgemm_( &transa, - //zgemm3m_( &transa, - &transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); - } -#endif - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); - - if ( bli_is_complex( dt_a ) ) gflops *= 4.0; - -#ifdef BLIS - printf( "data_gemm_blis" ); -#else - printf( "data_gemm_%s", BLAS ); -#endif - printf( "( %2lu, 
1:5 ) = [ %4lu %4lu %4lu %10.3e %6.3f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, - ( unsigned long )m, - ( unsigned long )k, - ( unsigned long )n, dtime_save, gflops ); - - bli_obj_free( &alpha ); - bli_obj_free( &beta ); - - bli_obj_free( &a ); - bli_obj_free( &b ); - bli_obj_free( &c ); - bli_obj_free( &c_save ); - } - - bli_finalize(); - - return 0; -} - diff --git a/test/mpi_test/test_hemm.c b/test/mpi_test/test_hemm.c deleted file mode 100644 index 1934de013..000000000 --- a/test/mpi_test/test_hemm.c +++ /dev/null @@ -1,252 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include -#include "blis.h" -#include - -// side uploa m n alpha a lda b ldb beta c ldc -//void dsymm_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); - -//#define PRINT - -int main( int argc, char** argv ) -{ - obj_t a, b, c; - obj_t c_save; - obj_t alpha, beta; - dim_t m, n; - dim_t p; - dim_t p_begin, p_end, p_inc; - int m_input, n_input; - num_t dt_a, dt_b, dt_c; - num_t dt_alpha, dt_beta; - int r, n_repeats; - side_t side; - uplo_t uplo; - - double dtime; - double dtime_save; - double gflops; - - bli_init(); - - n_repeats = 3; - - if( argc < 7 ) - { - printf("Usage:\n"); - printf("test_foo.x m n k p_begin p_inc p_end:\n"); - exit; - } - - int world_size, world_rank, provided; - MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided ); - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); - - m_input = strtol( argv[1], NULL, 10 ); - n_input = strtol( argv[2], NULL, 10 ); - p_begin = strtol( argv[4], NULL, 10 ); - p_inc = strtol( argv[5], NULL, 10 ); - p_end = strtol( argv[6], NULL, 10 ); - -#if 1 - dt_a = BLIS_DOUBLE; - dt_b = BLIS_DOUBLE; - dt_c = BLIS_DOUBLE; - dt_alpha = BLIS_DOUBLE; - dt_beta = BLIS_DOUBLE; -#else - dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX; -#endif - - side = BLIS_LEFT; - //side = BLIS_RIGHT; - - uplo = BLIS_LOWER; - //uplo = BLIS_UPPER; - - for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * 
world_size ) - { - - if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); - else m = ( dim_t ) m_input; - if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); - else n = ( dim_t ) n_input; - - - bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); - - if ( bli_is_left( side ) ) - bli_obj_create( dt_a, m, m, 0, 0, &a ); - else - bli_obj_create( dt_a, n, n, 0, 0, &a ); - bli_obj_create( dt_b, m, n, 0, 0, &b ); - bli_obj_create( dt_c, m, n, 0, 0, &c ); - bli_obj_create( dt_c, m, n, 0, 0, &c_save ); - - bli_randm( &a ); - bli_randm( &b ); - bli_randm( &c ); - - bli_obj_set_struc( BLIS_HERMITIAN, &a ); - bli_obj_set_uplo( uplo, &a ); - - // Randomize A, make it densely Hermitian, and zero the unstored - // triangle to ensure the implementation reads only from the stored - // region. - bli_randm( &a ); - bli_mkherm( &a ); - bli_mktrim( &a ); -/* - bli_obj_toggle_uplo( &a ); - bli_obj_inc_diag_offset( 1, &a ); - bli_setm( &BLIS_ZERO, &a ); - bli_obj_inc_diag_offset( -1, &a ); - bli_obj_toggle_uplo( &a ); - bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a ); - bli_scalm( &BLIS_TWO, &a ); - bli_scalm( &BLIS_TWO, &a ); -*/ - - - bli_setsc( (2.0/1.0), 1.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); - - - bli_copym( &c, &c_save ); - - dtime_save = 1.0e9; - - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &c_save, &c ); - - - dtime = bli_clock(); - -#ifdef PRINT -/* - obj_t ar, ai; - bli_obj_alias_to( &a, &ar ); - bli_obj_alias_to( &a, &ai ); - bli_obj_set_dt( BLIS_DOUBLE, &ar ); ar.rs *= 2; ar.cs *= 2; - bli_obj_set_dt( BLIS_DOUBLE, &ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1; - bli_printm( "ar", &ar, "%4.1f", "" ); - bli_printm( "ai", &ai, "%4.1f", "" ); -*/ - - bli_printm( "a", &a, "%4.1f", "" ); - bli_printm( "b", &b, "%4.1f", "" ); - bli_printm( "c", &c, "%4.1f", "" ); -#endif - -#ifdef BLIS - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - bli_hemm( side, - //bli_hemm4m( side, - &alpha, - &a, - 
&b, - &beta, - &c ); -#else - - f77_char side = 'L'; - f77_char uplo = 'L'; - f77_int mm = bli_obj_length( &c ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - double* ap = bli_obj_buffer( &a ); - double* bp = bli_obj_buffer( &b ); - double* betap = bli_obj_buffer( &beta ); - double* cp = bli_obj_buffer( &c ); - - dsymm_( &side, - &uplo, - &mm, - &nn, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); -#endif - -#ifdef PRINT - bli_printm( "c after", &c, "%9.5f", "" ); - exit(1); -#endif - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - if ( bli_is_left( side ) ) - gflops = ( 2.0 * m * m * n ) / ( dtime_save * 1.0e9 ); - else - gflops = ( 2.0 * m * n * n ) / ( dtime_save * 1.0e9 ); - - if ( bli_is_complex( dt_a ) ) gflops *= 4.0; - -#ifdef BLIS - printf( "data_hemm_blis" ); -#else - printf( "data_hemm_%s", BLAS ); -#endif - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, - ( unsigned long )m, - ( unsigned long )n, dtime_save, gflops ); - - bli_obj_free( &alpha ); - bli_obj_free( &beta ); - - bli_obj_free( &a ); - bli_obj_free( &b ); - bli_obj_free( &c ); - bli_obj_free( &c_save ); - } - - bli_finalize(); - - return 0; -} - diff --git a/test/mpi_test/test_her2k.c b/test/mpi_test/test_her2k.c deleted file mode 100644 index 6aa15038a..000000000 --- a/test/mpi_test/test_her2k.c +++ /dev/null @@ -1,209 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include -#include "blis.h" -#include - -// uploa transa m k alpha a lda b ldb beta c ldc -//void dsyr2k_( char*, char*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); - -//#define PRINT - -int main( int argc, char** argv ) -{ - obj_t a, b, c; - obj_t c_save; - obj_t alpha, beta; - dim_t m, k; - dim_t p; - dim_t p_begin, p_end, p_inc; - int m_input, k_input; - num_t dt_a, dt_b, dt_c; - num_t dt_alpha, dt_beta; - int r, n_repeats; - uplo_t uplo; - - double dtime; - double dtime_save; - double gflops; - - bli_init(); - - n_repeats = 3; - - if( argc < 7 ) - { - printf("Usage:\n"); - printf("test_foo.x m n k p_begin p_inc p_end:\n"); - exit; - } - - int world_size, world_rank, provided; - MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided ); - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); - - m_input = strtol( argv[1], NULL, 10 ); - k_input = strtol( argv[3], NULL, 10 ); - p_begin = strtol( argv[4], NULL, 10 ); - p_inc = strtol( argv[5], NULL, 10 ); - p_end = strtol( argv[6], NULL, 10 ); - - dt_a = BLIS_DOUBLE; - dt_b = BLIS_DOUBLE; - dt_c = BLIS_DOUBLE; - dt_alpha = BLIS_DOUBLE; - dt_beta = BLIS_DOUBLE; - - uplo = BLIS_LOWER; - - for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size ) - { - - if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); - else m = ( dim_t ) m_input; - if ( k_input < 0 ) k = p * ( dim_t )abs(k_input); - else k = ( dim_t ) k_input; - - - bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); - - bli_obj_create( dt_a, m, k, 0, 0, &a ); - bli_obj_create( dt_b, m, k, 0, 0, &b ); - bli_obj_create( dt_c, m, m, 0, 0, &c ); - bli_obj_create( dt_c, m, m, 0, 0, &c_save ); - - bli_randm( &a ); - bli_randm( &b ); - bli_randm( &c ); - - bli_obj_set_struc( BLIS_HERMITIAN, &c ); - bli_obj_set_uplo( uplo, &c ); - - - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); - - - bli_copym( 
&c, &c_save ); - - dtime_save = 1.0e9; - - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &c_save, &c ); - - - dtime = bli_clock(); - -#ifdef PRINT - bli_printm( "a", &a, "%4.1f", "" ); - bli_printm( "b", &b, "%4.1f", "" ); - bli_printm( "c", &c, "%4.1f", "" ); -#endif - -#ifdef BLIS - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - bli_her2k( &alpha, - &a, - &b, - &beta, - &c ); - -#else - - f77_char uploa = 'L'; - f77_char transa = 'N'; - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - double* ap = bli_obj_buffer( &a ); - double* bp = bli_obj_buffer( &b ); - double* betap = bli_obj_buffer( &beta ); - double* cp = bli_obj_buffer( &c ); - - dsyr2k_( &uploa, - &transa, - &mm, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); -#endif - -#ifdef PRINT - bli_printm( "c after", &c, "%4.1f", "" ); - exit(1); -#endif - - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - gflops = ( 2.0 * m * k * m ) / ( dtime_save * 1.0e9 ); - -#ifdef BLIS - printf( "data_her2k_blis" ); -#else - printf( "data_her2k_%s", BLAS ); -#endif - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, - ( unsigned long )m, - ( unsigned long )k, dtime_save, gflops ); - - - bli_obj_free( &alpha ); - bli_obj_free( &beta ); - - bli_obj_free( &a ); - bli_obj_free( &b ); - bli_obj_free( &c ); - bli_obj_free( &c_save ); - } - - bli_finalize(); - - return 0; -} - diff --git a/test/mpi_test/test_herk.c b/test/mpi_test/test_herk.c deleted file mode 100644 index 06e11afe1..000000000 --- a/test/mpi_test/test_herk.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include -#include "blis.h" -#include - -// uploa transa m k alpha a lda beta c ldc -//void dsyrk_( char*, char*, int*, int*, double*, double*, int*, double*, double*, int* ); - -//#define PRINT - -int main( int argc, char** argv ) -{ - obj_t a, c; - obj_t c_save; - obj_t alpha, beta; - dim_t m, k; - dim_t p; - dim_t p_begin, p_end, p_inc; - int m_input, k_input; - num_t dt_a, dt_c; - num_t dt_alpha, dt_beta; - int r, n_repeats; - uplo_t uplo; - - double dtime; - double dtime_save; - double gflops; - - bli_init(); - - n_repeats = 3; - - if( argc < 7 ) - { - printf("Usage:\n"); - printf("test_foo.x m n k p_begin p_inc p_end:\n"); - exit; - } - - int world_size, world_rank, provided; - MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided ); - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); - - m_input = strtol( argv[1], NULL, 10 ); - k_input = strtol( argv[3], NULL, 10 ); - p_begin = strtol( argv[4], NULL, 10 ); - p_inc = strtol( argv[5], NULL, 10 ); - p_end = strtol( argv[6], NULL, 10 ); - - dt_a = BLIS_DOUBLE; - dt_c = BLIS_DOUBLE; - dt_alpha = BLIS_DOUBLE; - dt_beta = BLIS_DOUBLE; - - uplo = BLIS_LOWER; - - for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size ) - { - - if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); - else m = ( dim_t ) m_input; - if ( k_input < 0 ) k = p * ( dim_t )abs(k_input); - else k = ( dim_t ) k_input; - - - bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); - - bli_obj_create( dt_a, m, k, 0, 0, &a ); - bli_obj_create( dt_c, m, m, 0, 0, &c ); - bli_obj_create( dt_c, m, m, 0, 0, &c_save ); - - bli_randm( &a ); - bli_randm( &c ); - - bli_obj_set_struc( BLIS_HERMITIAN, &c ); - bli_obj_set_uplo( uplo, &c ); - - - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); - - - bli_copym( &c, &c_save ); - - dtime_save = 1.0e9; - - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &c_save, &c ); - - 
- dtime = bli_clock(); - -#ifdef PRINT - bli_printm( "a", &a, "%4.1f", "" ); - bli_printm( "c", &c, "%4.1f", "" ); -#endif - -#ifdef BLIS - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - bli_herk( &alpha, - &a, - &beta, - &c ); - -#else - - f77_char uploa = 'L'; - f77_char transa = 'N'; - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - double* ap = bli_obj_buffer( &a ); - double* betap = bli_obj_buffer( &beta ); - double* cp = bli_obj_buffer( &c ); - - dsyrk_( &uploa, - &transa, - &mm, - &kk, - alphap, - ap, &lda, - betap, - cp, &ldc ); -#endif - -#ifdef PRINT - bli_printm( "c after", &c, "%4.1f", "" ); - exit(1); -#endif - - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 ); - -#ifdef BLIS - printf( "data_herk_blis" ); -#else - printf( "data_herk_%s", BLAS ); -#endif - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, - ( unsigned long )m, - ( unsigned long )k, dtime_save, gflops ); - - - bli_obj_free( &alpha ); - bli_obj_free( &beta ); - - bli_obj_free( &a ); - bli_obj_free( &c ); - bli_obj_free( &c_save ); - } - - bli_finalize(); - - return 0; -} - diff --git a/test/mpi_test/test_trmm.c b/test/mpi_test/test_trmm.c deleted file mode 100644 index 2ba1c6a79..000000000 --- a/test/mpi_test/test_trmm.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include -#include "blis.h" -#include - -// side uplo trans diag m n alpha a lda b ldb -//void dtrmm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* ); - -//#define PRINT - -int main( int argc, char** argv ) -{ - obj_t a, b, c; - obj_t c_save; - obj_t alpha, beta; - dim_t m, n; - dim_t p; - dim_t p_begin, p_end, p_inc; - int m_input, n_input; - num_t dt_a, dt_b, dt_c; - num_t dt_alpha, dt_beta; - int r, n_repeats; - side_t side; - uplo_t uplo; - - double dtime; - double dtime_save; - double gflops; - - bli_init(); - - n_repeats = 3; - - if( argc < 7 ) - { - printf("Usage:\n"); - printf("test_foo.x m n p_begin p_inc p_end:\n"); - exit; - } - - int world_size, world_rank, provided; - MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided ); - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); - - m_input = strtol( argv[1], NULL, 10 ); - n_input = strtol( argv[2], NULL, 10 ); - p_begin = strtol( argv[4], NULL, 10 ); - p_inc = strtol( argv[5], NULL, 10 ); - p_end = strtol( argv[6], NULL, 10 ); - -#if 1 - dt_a = BLIS_DOUBLE; - dt_b = BLIS_DOUBLE; - dt_c = BLIS_DOUBLE; - dt_alpha = BLIS_DOUBLE; - dt_beta = BLIS_DOUBLE; -#else - dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX; -#endif - - side = BLIS_LEFT; - //side = BLIS_RIGHT; - - uplo = BLIS_LOWER; - //uplo = BLIS_UPPER; - - for ( p = p_begin + world_rank * p_inc; p <= p_end; p += p_inc * world_size ) - { - - if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); - else m = ( dim_t ) m_input; - if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); - else n = ( dim_t ) n_input; - - - bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); - - if ( bli_is_left( side ) ) - bli_obj_create( dt_a, m, m, 0, 0, &a ); - else - bli_obj_create( dt_a, n, n, 0, 0, &a ); - bli_obj_create( dt_b, m, n, 0, 0, &b ); - bli_obj_create( dt_c, m, n, 0, 0, &c ); - bli_obj_create( dt_c, m, n, 0, 0, &c_save ); - - 
bli_obj_set_struc( BLIS_TRIANGULAR, &a ); - bli_obj_set_uplo( uplo, &a ); - - bli_randm( &a ); - bli_randm( &c ); - bli_randm( &b ); - -/* - bli_obj_toggle_uplo( &a ); - bli_obj_inc_diag_offset( -1, &a ); - bli_setm( &BLIS_ZERO, &a ); - bli_obj_inc_diag_offset( 1, &a ); - bli_obj_toggle_uplo( &a ); - bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a ); - bli_scalm( &BLIS_TWO, &a ); - //bli_scalm( &BLIS_TWO, &a ); -*/ - - - - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); - - - bli_copym( &c, &c_save ); - - dtime_save = 1.0e9; - - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &c_save, &c ); - - dtime = bli_clock(); - - -#ifdef PRINT - -/* - obj_t ar, ai; - bli_obj_alias_to( &a, &ar ); - bli_obj_alias_to( &a, &ai ); - bli_obj_set_dt( BLIS_DOUBLE, &ar ); ar.rs *= 2; ar.cs *= 2; - bli_obj_set_dt( BLIS_DOUBLE, &ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1; - bli_printm( "ar", &ar, "%4.1f", "" ); - bli_printm( "ai", &ai, "%4.1f", "" ); -*/ - bli_printm( "a", &a, "%4.1f", "" ); - bli_printm( "c", &c, "%4.1f", "" ); -#endif - -#ifdef BLIS - bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - bli_trmm( side, - //bli_trmm4m( side, - &alpha, - &a, - &c ); - -#else - - f77_char side = 'L'; - f77_char uplo = 'L'; - f77_char transa = 'N'; - f77_char diag = 'N'; - f77_int mm = bli_obj_length( &c ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = bli_obj_buffer( &alpha ); - double* ap = bli_obj_buffer( &a ); - double* cp = bli_obj_buffer( &c ); - - dtrmm_( &side, - &uplo, - &transa, - &diag, - &mm, - &nn, - alphap, - ap, &lda, - cp, &ldc ); -#endif - -#ifdef PRINT - bli_printm( "c after", &c, "%4.1f", "" ); - exit(1); -#endif - - - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - if ( bli_is_left( side ) ) - gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); - else - gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); 
- - if ( bli_is_complex( dt_a ) ) gflops *= 4.0; - -#ifdef BLIS - printf( "data_trmm_blis" ); -#else - printf( "data_trmm_%s", BLAS ); -#endif - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, - ( unsigned long )m, - ( unsigned long )n, dtime_save, gflops ); - - - bli_obj_free( &alpha ); - bli_obj_free( &beta ); - - bli_obj_free( &a ); - bli_obj_free( &b ); - bli_obj_free( &c ); - bli_obj_free( &c_save ); - } - - bli_finalize(); - - return 0; -} - diff --git a/test/mpi_test/test_trsm.c b/test/mpi_test/test_trsm.c deleted file mode 100644 index 12fc54232..000000000 --- a/test/mpi_test/test_trsm.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include -#include "blis.h" -#include - -// side uplo trans diag m n alpha a lda b ldb -//void dtrsm_( char*, char*, char*, char*, int*, int*, double*, double*, int*, double*, int* ); - -//#define PRINT - -int main( int argc, char** argv ) -{ - obj_t a, b, c; - obj_t c_save; - obj_t alpha, beta; - dim_t m, n; - dim_t p; - dim_t p_begin, p_end, p_inc; - int m_input, n_input; - num_t dt_a, dt_b, dt_c; - num_t dt_alpha, dt_beta; - int r, n_repeats; - side_t side; - uplo_t uplo; - - double dtime; - double dtime_save; - double gflops; - - bli_init(); - - n_repeats = 3; - - if( argc < 7 ) - { - printf("Usage:\n"); - printf("test_foo.x m n k p_begin p_inc p_end:\n"); - exit; - } - - int world_size, world_rank, provided; - MPI_Init_thread( NULL, NULL, MPI_THREAD_FUNNELED, &provided ); - MPI_Comm_size( MPI_COMM_WORLD, &world_size ); - MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); - - m_input = strtol( argv[1], NULL, 10 ); - n_input = strtol( argv[2], NULL, 10 ); - p_begin = strtol( argv[4], NULL, 10 ); - p_inc = strtol( argv[5], NULL, 10 ); - p_end = strtol( argv[6], NULL, 10 ); - -#if 1 - dt_a = BLIS_DOUBLE; - dt_b = BLIS_DOUBLE; - dt_c = BLIS_DOUBLE; - dt_alpha = BLIS_DOUBLE; - dt_beta = BLIS_DOUBLE; -#else - dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_FLOAT; - //dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_SCOMPLEX; -#endif - - side = BLIS_LEFT; - //side = BLIS_RIGHT; - - uplo = BLIS_LOWER; - //uplo = BLIS_UPPER; - - for ( p = p_begin + 
world_rank * p_inc; p <= p_end; p += p_inc * world_size ) - { - - if ( m_input < 0 ) m = p * ( dim_t )abs(m_input); - else m = ( dim_t ) m_input; - if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); - else n = ( dim_t ) n_input; - - - bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt_beta, 1, 1, 0, 0, &beta ); - - if ( bli_is_left( side ) ) - bli_obj_create( dt_a, m, m, 0, 0, &a ); - else - bli_obj_create( dt_a, n, n, 0, 0, &a ); - bli_obj_create( dt_b, m, n, 0, 0, &b ); - bli_obj_create( dt_c, m, n, 0, 0, &c ); - bli_obj_create( dt_c, m, n, 0, 0, &c_save ); - - bli_obj_set_struc( BLIS_TRIANGULAR, &a ); - bli_obj_set_uplo( uplo, &a ); - //bli_obj_set_diag( BLIS_UNIT_DIAG, &a ); - - bli_randm( &a ); - bli_randm( &c ); - bli_randm( &b ); - -/* - { - obj_t a2; - - bli_obj_alias_to( &a, &a2 ); - bli_obj_toggle_uplo( &a2 ); - bli_obj_inc_diag_offset( 1, &a2 ); - bli_setm( &BLIS_ZERO, &a2 ); - bli_obj_inc_diag_offset( -2, &a2 ); - bli_obj_toggle_uplo( &a2 ); - bli_obj_set_diag( BLIS_NONUNIT_DIAG, &a2 ); - bli_scalm( &BLIS_TWO, &a2 ); - //bli_scalm( &BLIS_TWO, &a ); - } -*/ - - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); - - - bli_copym( &c, &c_save ); - - dtime_save = 1.0e9; - - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &c_save, &c ); - - dtime = bli_clock(); - - -#ifdef PRINT -/* - obj_t ar, ai; - bli_obj_alias_to( &a, &ar ); - bli_obj_alias_to( &a, &ai ); - bli_obj_set_dt( BLIS_DOUBLE, &ar ); ar.rs *= 2; ar.cs *= 2; - bli_obj_set_dt( BLIS_DOUBLE, &ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1; - - bli_printm( "ar", &ar, "%4.1f", "" ); - bli_printm( "ai", &ai, "%4.1f", "" ); -*/ - - bli_invertd( &a ); - bli_printm( "a", &a, "%4.1f", "" ); - bli_invertd( &a ); - bli_printm( "c", &c, "%4.1f", "" ); -#endif - -#ifdef BLIS - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - bli_trsm( side, - //bli_trsm4m( side, - //bli_trsm3m( side, - &alpha, - &a, - &c ); -#else - - if ( bli_is_real( 
dt_a ) ) - { - f77_char side = 'L'; - f77_char uplo = 'L'; - f77_char transa = 'N'; - f77_char diag = 'N'; - f77_int mm = bli_obj_length( &c ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - float * alphap = bli_obj_buffer( &alpha ); - float * ap = bli_obj_buffer( &a ); - float * cp = bli_obj_buffer( &c ); - - strsm_( &side, - &uplo, - &transa, - &diag, - &mm, - &nn, - alphap, - ap, &lda, - cp, &ldc ); - } - else // if ( bli_is_complex( dt_a ) ) - { - f77_char side = 'L'; - f77_char uplo = 'L'; - f77_char transa = 'N'; - f77_char diag = 'N'; - f77_int mm = bli_obj_length( &c ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldc = bli_obj_col_stride( &c ); - scomplex* alphap = bli_obj_buffer( &alpha ); - scomplex* ap = bli_obj_buffer( &a ); - scomplex* cp = bli_obj_buffer( &c ); - - ctrsm_( &side, - //ztrsm_( &side, - &uplo, - &transa, - &diag, - &mm, - &nn, - alphap, - ap, &lda, - cp, &ldc ); - } - -#endif - -#ifdef PRINT - bli_printm( "c after", &c, "%4.1f", "" ); - exit(1); -#endif - - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - if ( bli_is_left( side ) ) - gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 ); - else - gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); - - if ( bli_is_complex( dt_a ) ) gflops *= 4.0; - -#ifdef BLIS - printf( "data_trsm_blis" ); -#else - printf( "data_trsm_%s", BLAS ); -#endif - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %10.3e %6.3f ];\n", - ( unsigned long )(p - p_begin + 1)/p_inc + 1, - ( unsigned long )m, - ( unsigned long )n, dtime_save, gflops ); - - - bli_obj_free( &alpha ); - bli_obj_free( &beta ); - - bli_obj_free( &a ); - bli_obj_free( &b ); - bli_obj_free( &c ); - bli_obj_free( &c_save ); - } - - bli_finalize(); - - return 0; -} - From dc6e5f3f5770074ba38554541b8b64711a68c084 Mon Sep 17 00:00:00 2001 From: leekillough <15950023+leekillough@users.noreply.github.com> Date: Thu, 3 Nov 2022 
18:33:08 -0500 Subject: [PATCH 106/230] Enhance emacs formatting of C files to remove trailing whitespace and ensure a newline at the end of file --- .dir-locals.el | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.dir-locals.el b/.dir-locals.el index fccb20502..711f4a63d 100644 --- a/.dir-locals.el +++ b/.dir-locals.el @@ -1,9 +1,11 @@ -;; First (minimal) attempt at configuring Emacs CC mode for the BLIS -;; layout requirements. +;; Emacs C mode formatting for the BLIS layout requirements. ((c-mode . ((c-file-style . "stroustrup") - (c-basic-offset . 4) - (comment-start . "// ") - (comment-end . "") - (indent-tabs-mode . t) - (tab-width . 4) - (parens-require-spaces . nil)))) + (c-basic-offset . 4) + (comment-start . "// ") + (comment-end . "") + (indent-tabs-mode . t) + (tab-width . 4) + (parens-require-spaces . nil) + (require-final-newline . t) + (eval add-hook `before-save-hook `delete-trailing-whitespace) + ))) From e1ea25da43508925e33d4e57e420cfc0a9de793f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 11 Nov 2022 12:07:51 -0600 Subject: [PATCH 107/230] Fixed subtle barrier_fpa bug in bli_thrcomm.c. (#690) Details: - In bli_thrcomm.c, correctly initialize the BLIS_OPENMP element of the barrier function pointer array (barrier_fpa) to NULL when BLIS_ENABLE_OPENMP is *not* defined. Similarly, initialize the BLIS_POSIX element of barrier_fpa to NULL when BLIS_ENABLE_PTHREADS is not enabled. This bug was introduced in a1a5a9b and was likely the result of an incomplete edit. The effects of the bug would have likely manifested when querying a thrcomm_t that was initialized with a timpl_t value corresponding to a threading implementation that was omitted from the -t option at configure-time. 
--- frame/thread/bli_thrcomm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index 0547d296e..f0bba205a 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -115,17 +115,17 @@ static thrcomm_barrier_ft barrier_fpa[ BLIS_NUM_THREAD_IMPLS ] = #if defined(BLIS_ENABLE_OPENMP) bli_thrcomm_barrier_openmp, #elif defined(BLIS_ENABLE_PTHREADS) - bli_thrcomm_barrier_pthreads, + NULL, #else - bli_thrcomm_barrier_single, + NULL, #endif [BLIS_POSIX] = #if defined(BLIS_ENABLE_PTHREADS) bli_thrcomm_barrier_pthreads, #elif defined(BLIS_ENABLE_OPENMP) - bli_thrcomm_barrier_openmp, + NULL, #else - bli_thrcomm_barrier_single, + NULL, #endif }; From 2b05948ad2c9785bc53f376d53a7141cbc917447 Mon Sep 17 00:00:00 2001 From: ct-clmsn Date: Sun, 13 Nov 2022 17:40:22 -0500 Subject: [PATCH 108/230] blis support for hpx (#682) Implement threading backend via HPX. HPX is an asynchronous many task runtime system used in high performance computing applications. The runtime implements the ISO C++ parallelism specification and provides a user-space thread implementation. This PR provides BLIS a thread backend implementation using HPX and resolves feature request #681. The configuration script, makefiles, and testsuite have been updated to support an HPX build option. The addition of HPX support provides other developers an exemplar for integrating other C++ threading backends into BLIS. 
Co-authored-by: ctaylor Co-authored-by: Devin Matthews --- Makefile | 10 + README.md | 2 +- blastest/src/cblat1.c | 40 +- blastest/src/cblat2.c | 426 ++++++++--------- blastest/src/cblat3.c | 602 ++++++++++++------------ blastest/src/dblat1.c | 164 +++---- blastest/src/dblat2.c | 444 +++++++++--------- blastest/src/dblat3.c | 428 ++++++++--------- blastest/src/sblat1.c | 180 ++++---- blastest/src/sblat2.c | 354 +++++++------- blastest/src/sblat3.c | 380 +++++++-------- blastest/src/zblat1.c | 48 +- blastest/src/zblat2.c | 518 +++++++++++---------- blastest/src/zblat3.c | 636 +++++++++++++------------- build/bli_config.h.in | 7 + build/config.mk.in | 1 + build/libblis-symbols.def | 2 + common.mk | 43 +- configure | 41 +- docs/FAQ.md | 4 +- docs/Multithreading.md | 6 +- frame/3/bli_l3_decor.c | 7 +- frame/base/bli_info.c | 19 +- frame/base/bli_info.h | 2 + frame/include/bli_config_macro_defs.h | 10 +- frame/include/bli_type_defs.h | 4 +- frame/thread/bli_thrcomm.c | 24 +- frame/thread/bli_thrcomm.h | 8 + frame/thread/bli_thrcomm_hpx.cpp | 92 ++++ frame/thread/bli_thrcomm_hpx.h | 48 ++ frame/thread/bli_thread.c | 17 +- frame/thread/bli_thread.h | 1 + frame/thread/bli_thread_hpx.cpp | 85 ++++ frame/thread/bli_thread_hpx.h | 54 +++ testsuite/src/test_libblis.c | 24 +- 35 files changed, 2648 insertions(+), 2083 deletions(-) create mode 100644 frame/thread/bli_thrcomm_hpx.cpp create mode 100644 frame/thread/bli_thrcomm_hpx.h create mode 100644 frame/thread/bli_thread_hpx.cpp create mode 100644 frame/thread/bli_thread_hpx.h diff --git a/Makefile b/Makefile index 04cdca421..33641f8c8 100644 --- a/Makefile +++ b/Makefile @@ -552,6 +552,16 @@ else @echo "Compiling $$@" $(call get-frame-text-for,$(1)) @$(CC) $(call get-frame-cflags-for,$(1)) -c $$< -o $$@ endif + +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +$(BASE_OBJ_FRAME_PATH)/%.o: $(FRAME_PATH)/%.cpp $(BLIS_H_FLAT) $(MAKE_DEFS_MK_PATHS) +ifeq ($(ENABLE_VERBOSE),yes) + $(CXX) $(call get-frame-cxxflags-for,$(1)) -c $$< -o 
$$@ +else + @echo "Compiling $$@" $(call get-frame-cxxtext-for,$(1)) + @$(CXX) $(call get-frame-cxxflags-for,$(1)) -c $$< -o $$@ +endif +endif endef # first argument: a kernel set (name) being targeted (e.g. haswell). diff --git a/README.md b/README.md index e0e4238ca..68c937f52 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ writing complex kernels. * **Advanced multithreading support.** BLIS allows multiple levels of symmetric multithreading for nearly all level-3 operations. (Currently, users -may choose to obtain parallelism via either OpenMP or POSIX threads). This +may choose to obtain parallelism via OpenMP, POSIX threads, or HPX). This means that matrices may be partitioned in multiple dimensions simultaneously to attain scalable, high-performance parallelism on multicore and many-core architectures. The key to this innovation is a thread-specific control tree diff --git a/blastest/src/cblat1.c b/blastest/src/cblat1.c index 606511662..656294684 100644 --- a/blastest/src/cblat1.c +++ b/blastest/src/cblat1.c @@ -68,6 +68,11 @@ static real c_b52 = 0.f; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "cblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static real sfac = 9.765625e-4f; @@ -136,7 +141,12 @@ static real c_b52 = 0.f; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -230,7 +240,7 @@ static real c_b52 = 0.f; complex q__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -238,15 +248,15 @@ static real c_b52 = 0.f; integer i__; complex cx[8]; integer np1, len; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, - integer *), ctest_(integer *, complex *, complex *, complex *, + extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + integer *), ctest_(integer *, complex *, complex *, complex *, real *); complex mwpcs[5], mwpct[5]; extern real scnrm2_(integer *, complex *, integer *); extern /* Subroutine */ int itest1_(integer *, integer *), stest1_(real *, real *, real *, real *); extern integer icamax_(integer *, complex *, integer *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -465,7 +475,7 @@ static real c_b52 = 0.f; complex q__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -481,9 +491,9 @@ static real c_b52 = 0.f; #else complex cdotc_( #endif - integer *, complex *, integer + integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, complex *, integer *); extern /* Complex */ #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL @@ -491,13 +501,13 @@ complex cdotc_( #else complex cdotu_( #endif - integer *, complex *, integer + integer *, complex *, integer *, complex 
*, integer *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, - complex *, integer *), ctest_(integer *, complex *, complex *, + extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + complex *, integer *), ctest_(integer *, complex *, complex *, complex *, real *); integer ksize; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); /* Fortran I/O blocks */ @@ -691,7 +701,7 @@ complex cdotu_( sfac) { real scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); /* ************************* STEST1 ***************************** */ @@ -733,7 +743,7 @@ real sdiff_(real *sa, real *sb) return ret_val; } /* sdiff_ */ -/* Subroutine */ int ctest_(integer *len, complex *ccomp, complex *ctrue, +/* Subroutine */ int ctest_(integer *len, complex *ccomp, complex *ctrue, complex *csize, real *sfac) { /* System generated locals */ @@ -745,7 +755,7 @@ real sdiff_(real *sa, real *sb) /* Local variables */ integer i__; real scomp[20], ssize[20], strue[20]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); /* **************************** CTEST ***************************** */ diff --git a/blastest/src/cblat2.c b/blastest/src/cblat2.c index 2916a36a4..08d215aee 100644 --- a/blastest/src/cblat2.c +++ b/blastest/src/cblat2.c @@ -158,10 +158,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "cblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*17] = "CGEMV " "CGBMV " "CHEMV " "CHBMV " "CHPMV " - "CTRMV " "CTBMV " "CTPMV " 
"CTRSV " "CTBSV " "CTPSV " "CGERC " + static char snames[6*17] = "CGEMV " "CGBMV " "CHEMV " "CHBMV " "CHPMV " + "CTRMV " "CTBMV " "CTPMV " "CTRSV " "CTBSV " "CTPSV " "CGERC " "CGERU " "CHER " "CHPR " "CHER2 " "CHPR2 "; /* Format strings */ @@ -209,10 +214,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -234,42 +239,42 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, complex *, integer *, complex *, - integer *, integer *, integer *, integer *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, real *, ftnlen), cchk2_(char *, real *, - real *, integer *, integer *, logical *, logical *, logical *, - integer *, integer *, integer *, integer *, integer *, complex *, - integer *, complex *, integer *, integer *, integer *, integer *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, ftnlen), - cchk3_(char *, real *, real *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, integer *, - integer *, integer *, integer *, integer *, complex *, complex *, - complex *, complex *, complex *, 
complex *, complex *, real *, - complex *, ftnlen), cchk4_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, complex *, integer *, integer *, integer *, integer *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, complex *, - ftnlen), cchk5_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - complex *, integer *, integer *, integer *, integer *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, real *, complex *, ftnlen), - cchk6_(char *, real *, real *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, complex *, - integer *, integer *, integer *, integer *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, + extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, integer *, integer *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, real *, ftnlen), cchk2_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, integer *, integer *, complex *, + integer *, complex *, integer *, integer *, integer *, integer *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, ftnlen), + cchk3_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, integer *, + integer *, integer *, integer *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, real *, + complex *, ftnlen), 
cchk4_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, integer *, integer *, integer *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, complex *, + ftnlen), cchk5_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + complex *, integer *, integer *, integer *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, real *, complex *, ftnlen), + cchk6_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, complex *, + integer *, integer *, integer *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, complex *, complex *, real *, complex *, ftnlen), cchke_(integer * , char *, integer *, ftnlen); logical fatal, trace; integer nidim; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); char snaps[32], trans[1]; integer isnum; @@ -618,7 +623,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 17; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -677,7 +682,7 @@ static logical c_false = FALSE_; /* YY holds the exact result. On exit from CMVCH YT holds */ /* the result computed by CMVCH. 
*/ *(unsigned char *)trans = 'N'; - cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, + cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lce_(yy, yt, &n); if (! same || err != 0.f) { @@ -690,7 +695,7 @@ static logical c_false = FALSE_; s_stop("", (ftnlen)0); } *(unsigned char *)trans = 'T'; - cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, + cmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lce_(yy, yt, &n); if (! same || err != 0.f) { @@ -751,44 +756,44 @@ static logical c_false = FALSE_; /* Test CGEMV, 01, and CGBMV, 02. */ L140: cchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. */ L150: cchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test CTRMV, 06, CTBMV, 07, CTPMV, 08, */ /* CTRSV, 09, CTBSV, 10, and CTPSV, 11. */ L160: cchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test CGERC, 12, CGERU, 13. 
*/ L170: cchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test CHER, 14, and CHPR, 15. */ L180: cchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test CHER2, 16, and CHPR2, 17. */ L190: cchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -830,15 +835,20 @@ static logical c_false = FALSE_; /* End of CBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * - nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, + nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, integer *inc, integer *nmax, integer *incmax, complex *a, complex *aa, - complex *as, complex *x, complex *xx, complex *xs, complex *y, + complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -867,7 +877,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -887,26 +897,26 @@ static logical c_false = FALSE_; logical same; integer incx, incy; logical full, tran, null; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int cgbmv_(char *, integer *, integer *, integer * , integer *, complex *, complex *, integer *, complex *, integer * - , complex *, complex *, integer *, ftnlen), cgemv_(char *, - integer *, integer *, complex *, complex *, integer *, complex *, + , complex *, complex *, integer *, ftnlen), cgemv_(char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen), cmvch_(char * , integer *, integer *, complex *, 
complex *, integer *, complex * - , integer *, complex *, complex *, integer *, complex *, real *, - complex *, real *, real *, logical *, integer *, logical *, + , integer *, complex *, complex *, integer *, complex *, real *, + complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; integer incxs, incys; char trans[1]; logical banded; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -1089,9 +1099,9 @@ static logical c_false = FALSE_; transl.r = 0.f, transl.i = 0.f; i__7 = abs(incy); i__8 = ml - 1; - cmake_("GE", " ", " ", &c__1, &ml, &y[1], + cmake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1099,7 +1109,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. 
*/ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1110,7 +1120,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - as[i__8].r = aa[i__9].r, as[i__8].i = + as[i__8].r = aa[i__9].r, as[i__8].i = aa[i__9].i; /* L10: */ } @@ -1119,7 +1129,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - xs[i__8].r = xx[i__9].r, xs[i__8].i = + xs[i__8].r = xx[i__9].r, xs[i__8].i = xx[i__9].i; /* L20: */ } @@ -1129,7 +1139,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - ys[i__8].r = yy[i__9].r, ys[i__8].i = + ys[i__8].r = yy[i__9].r, ys[i__8].i = yy[i__9].i; /* L30: */ } @@ -1166,7 +1176,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - cgemv_(trans, &m, &n, &alpha, &aa[1], + cgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1225,7 +1235,7 @@ static logical c_false = FALSE_; isame[1] = ms == m; isame[2] = ns == n; if (full) { - isame[3] = als.r == alpha.r && als.i + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lce_(&as[1], &aa[1], &laa); isame[5] = ldas == lda; @@ -1247,13 +1257,13 @@ static logical c_false = FALSE_; } else if (banded) { isame[3] = kls == kl; isame[4] = kus == ku; - isame[5] = als.r == alpha.r && als.i + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lce_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lce_(&xs[1], &xx[1], &lx); isame[9] = incxs == incx; - isame[10] = bls.r == beta.r && bls.i + isame[10] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[11] = lce_(&ys[1], &yy[1], & @@ -1295,8 +1305,8 @@ static logical c_false = FALSE_; cmvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, 
fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1401,11 +1411,11 @@ static logical c_false = FALSE_; } /* cchk1_ */ /* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * - nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, + nalf, complex *alf, integer *nbet, complex *bet, integer *ninc, integer *inc, integer *nmax, integer *incmax, complex *a, complex *aa, - complex *as, complex *x, complex *xx, complex *xs, complex *y, + complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1438,7 +1448,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -1447,7 +1457,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; extern logical lce_(complex *, complex *, integer *); complex als, bls; @@ -1458,18 +1468,18 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int chbmv_(char *, integer *, integer *, complex * , complex 
*, integer *, complex *, integer *, complex *, complex * - , integer *, ftnlen), chemv_(char *, integer *, complex *, - complex *, integer *, complex *, integer *, complex *, complex *, + , integer *, ftnlen), chemv_(char *, integer *, complex *, + complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen), cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; extern /* Subroutine */ int chpmv_(char *, integer *, complex *, complex * @@ -1478,7 +1488,7 @@ static logical c_false = FALSE_; integer incxs, incys; char uplos[1]; logical banded, packed; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -1643,7 +1653,7 @@ static logical c_false = FALSE_; i__8 = n - 1; cmake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, &yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1795,13 +1805,13 @@ static logical c_false = FALSE_; unsigned char *)uplos; isame[1] = ns == n; if (full) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lce_(&as[1], &aa[1], &laa); isame[4] = ldas == lda; isame[5] = lce_(&xs[1], &xx[1], &lx); isame[6] = incxs == incx; - isame[7] = bls.r == beta.r && bls.i == + isame[7] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[8] = lce_(&ys[1], &yy[1], &ly); @@ -1814,13 +1824,13 @@ static logical c_false = FALSE_; isame[9] = incys == incy; } else if (banded) { isame[2] = ks == k; - isame[3] = als.r == alpha.r && als.i == + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lce_(&as[1], 
&aa[1], &laa); isame[5] = ldas == lda; isame[6] = lce_(&xs[1], &xx[1], &lx); isame[7] = incxs == incx; - isame[8] = bls.r == beta.r && bls.i == + isame[8] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[9] = lce_(&ys[1], &yy[1], &ly); @@ -1832,12 +1842,12 @@ static logical c_false = FALSE_; } isame[10] = incys == incy; } else if (packed) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lce_(&as[1], &aa[1], &laa); isame[4] = lce_(&xs[1], &xx[1], &lx); isame[5] = incxs == incx; - isame[6] = bls.r == beta.r && bls.i == + isame[6] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[7] = lce_(&ys[1], &yy[1], &ly); @@ -1875,8 +1885,8 @@ static logical c_false = FALSE_; /* Check the result. */ - cmvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + cmvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -1987,10 +1997,10 @@ static logical c_false = FALSE_; } /* cchk2_ */ /* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *xt, real *g, complex *z__, ftnlen sname_len) { /* Initialized data */ @@ -2040,36 +2050,36 @@ static logical c_false = FALSE_; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, 
char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); char diags[1]; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; - extern /* Subroutine */ int ctbmv_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, ftnlen, - ftnlen, ftnlen), ctbsv_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, ftnlen, + extern /* Subroutine */ int ctbmv_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, ftnlen, + ftnlen, ftnlen), ctbsv_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen); logical reset; integer incxs; char trans[1]; - extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_( char *, char *, char *, integer *, complex *, integer *, complex * - , integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char - *, integer *, complex *, complex *, integer *, ftnlen, ftnlen, + , integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char + *, integer *, complex *, complex *, integer *, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *, - complex *, integer *, complex *, integer *, ftnlen, ftnlen, + extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *, + complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; - extern logical lceres_(char *, char *, 
integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -2197,13 +2207,13 @@ static logical c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + *(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl.r = 0.f, transl.i = 0.f; - cmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + cmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2258,7 +2268,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2311,7 +2321,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ctbmv_(uplo, trans, diag, &n, &k, &aa[1], + ctbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2392,7 +2402,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ctbsv_(uplo, trans, diag, &n, &k, &aa[1], + ctbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2434,11 +2444,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2508,7 +2518,7 @@ static logical c_false = FALSE_; cmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &x[1], &incx, &c_b1, &z__[ - 1], &incx, &xt[1], &g[1], &xx[1], + 1], &incx, &xt[1], &g[1], &xx[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( @@ -2520,18 +2530,18 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__4; ++i__) { i__5 = i__; i__6 = (i__ - 1) * abs(incx) + 1; - z__[i__5].r = xx[i__6].r, z__[i__5].i + z__[i__5].r = xx[i__6].r, z__[i__5].i = xx[i__6].i; i__5 = (i__ - 1) * abs(incx) + 1; i__6 = i__; - xx[i__5].r = x[i__6].r, xx[i__5].i = + xx[i__5].r = x[i__6].r, xx[i__5].i = x[i__6].i; /* L50: */ } cmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &z__[1], &incx, &c_b1, &x[ - 1], &incx, &xt[1], &g[1], &xx[1], - eps, &err, fatal, nout, &c_false, + 1], &incx, &xt[1], &g[1], &xx[1], + eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2634,10 +2644,10 @@ static logical c_false = FALSE_; } /* cchk3_ */ /* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, 
complex * z__, ftnlen sname_len) { @@ -2681,23 +2691,23 @@ static logical c_false = FALSE_; logical same, conj; integer incx, incy; logical null; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen), cgerc_( - integer *, integer *, complex *, complex *, integer *, complex *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, - logical *, integer *, logical *, ftnlen), cgeru_(integer *, - integer *, complex *, complex *, integer *, complex *, integer *, + , integer *, complex *, real *, complex *, real *, real *, + logical *, integer *, logical *, ftnlen), cgeru_(integer *, + integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); integer nargs; logical reset; integer incxs, incys; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -2801,7 +2811,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; cmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 1) { i__3 = m / 2; @@ -2840,7 +2850,7 @@ static logical c_false = FALSE_; transl.r = 0.f, transl.i = 0.f; i__5 = m - 1; i__6 = n - 1; - cmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + cmake_(sname + 1, " 
", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2999,9 +3009,9 @@ static logical c_false = FALSE_; r_cnjg(&q__1, w); w[0].r = q__1.r, w[0].i = q__1.i; } - cmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + cmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, &c_b2, &a[j * a_dim1 + 1], & - c__1, &yt[1], &g[1], &aa[(j - 1) * + c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -3082,10 +3092,10 @@ static logical c_false = FALSE_; } /* cchk4_ */ /* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, complex * z__, ftnlen sname_len) { @@ -3130,24 +3140,24 @@ static logical c_false = FALSE_; integer ia, ja, ic, nc, jj, lj, in, ix, ns, lx, laa, lda; extern logical lce_(complex *, complex *, integer *); real err; - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, integer *, complex *, integer *, ftnlen); integer ldas; logical same; - extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, + extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, integer *, complex *, ftnlen); real rals; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, 
integer *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3156,7 +3166,7 @@ static logical c_false = FALSE_; char uplos[1]; logical packed; real ralpha; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -3261,7 +3271,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; cmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3336,7 +3346,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - cher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, + cher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, (ftnlen)1); } else if (packed) { if (*trace) { @@ -3446,9 +3456,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - cmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, - &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, - &yt[1], &g[1], &aa[ja], eps, &err, fatal, + cmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, + &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3547,10 +3557,10 @@ static logical c_false = FALSE_; } /* cchk5_ */ /* Subroutine */ int cchk6_(char *sname, real *eps, real *thresh, 
integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - ninc, integer *inc, integer *nmax, integer *incmax, complex *a, - complex *aa, complex *as, complex *x, complex *xx, complex *xs, + ninc, integer *inc, integer *nmax, integer *incmax, complex *a, + complex *aa, complex *as, complex *x, complex *xx, complex *xs, complex *y, complex *yy, complex *ys, complex *yt, real *g, complex * z__, ftnlen sname_len) { @@ -3580,7 +3590,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1, q__2, q__3; alist al__1; @@ -3603,17 +3613,17 @@ static logical c_false = FALSE_; logical full, null; char uplo[1]; extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex * - , integer *, complex *, integer *, complex *, integer *, ftnlen), - chpr2_(char *, integer *, complex *, complex *, integer *, - complex *, integer *, complex *, ftnlen), cmake_(char *, char *, - char *, integer *, integer *, complex *, integer *, complex *, - integer *, integer *, integer *, logical *, complex *, ftnlen, + , integer *, complex *, integer *, complex *, integer *, ftnlen), + chpr2_(char *, integer *, complex *, complex *, integer *, + complex *, integer *, complex *, ftnlen), cmake_(char *, char *, + char *, integer *, integer *, complex *, integer *, complex *, + integer *, integer *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; logical isame[13]; extern /* Subroutine */ int cmvch_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *, complex *, real *, complex *, real *, real *, + , integer *, complex *, real *, complex *, 
real *, real *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3621,7 +3631,7 @@ static logical c_false = FALSE_; logical upper; char uplos[1]; logical packed; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; complex transl; @@ -3728,7 +3738,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; cmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3768,7 +3778,7 @@ static logical c_false = FALSE_; transl.r = 0.f, transl.i = 0.f; i__5 = n - 1; i__6 = n - 1; - cmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + cmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3956,14 +3966,14 @@ static logical c_false = FALSE_; i__5 = n; for (j = 1; j <= i__5; ++j) { r_cnjg(&q__2, &z__[j + (z_dim1 << 1)]); - q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, - q__1.i = alpha.r * q__2.i + alpha.i * + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * q__2.r; w[0].r = q__1.r, w[0].i = q__1.i; r_cnjg(&q__2, &alpha); r_cnjg(&q__3, &z__[j + z_dim1]); - q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, - q__1.i = q__2.r * q__3.i + q__2.i * + q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, + q__1.i = q__2.r * q__3.i + q__2.i * q__3.r; w[1].r = q__1.r, w[1].i = q__1.i; if (upper) { @@ -3973,8 +3983,8 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - cmvch_("N", &lj, &c__2, &c_b2, &z__[jj + - z_dim1], nmax, w, &c__1, &c_b2, &a[jj + cmvch_("N", &lj, &c__2, &c_b2, &z__[jj + + z_dim1], nmax, w, &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, &yt[1], &g[1], & aa[ja], eps, &err, fatal, nout, & c_true, (ftnlen)1); @@ -4079,7 
+4089,7 @@ static logical c_false = FALSE_; } /* cchk6_ */ -/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -4093,40 +4103,40 @@ static logical c_false = FALSE_; /* Local variables */ complex a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, integer *, complex *, integer *, ftnlen), chpr_(char *, integer *, - real *, complex *, integer *, complex *, ftnlen), cher2_(char *, - integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *, complex *, ftnlen), cher2_(char *, + integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, ftnlen), chpr2_(char *, integer *, complex * - , complex *, integer *, complex *, integer *, complex *, ftnlen), - cgerc_(integer *, integer *, complex *, complex *, integer *, + , complex *, integer *, complex *, integer *, complex *, ftnlen), + cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; extern /* Subroutine */ int cgbmv_(char *, integer *, integer *, integer * , integer *, complex *, complex *, integer *, complex *, integer * - , complex *, complex *, integer *, ftnlen), chbmv_(char *, - integer *, integer *, complex *, complex *, integer *, complex *, + , complex *, complex *, integer *, ftnlen), chbmv_(char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen), cgemv_(char * , integer *, integer *, complex *, complex *, integer *, complex * , integer *, complex *, complex *, integer *, ftnlen), chemv_( - char *, integer *, complex *, complex *, integer *, complex *, + char *, integer *, complex *, complex *, integer *, complex *, integer *, complex 
*, complex *, integer *, ftnlen), cgeru_( - integer *, integer *, complex *, complex *, integer *, complex *, - integer *, complex *, integer *), ctbmv_(char *, char *, char *, - integer *, integer *, complex *, integer *, complex *, integer *, - ftnlen, ftnlen, ftnlen), chpmv_(char *, integer *, complex *, - complex *, complex *, integer *, complex *, complex *, integer *, - ftnlen), ctbsv_(char *, char *, char *, integer *, integer *, - complex *, integer *, complex *, integer *, ftnlen, ftnlen, - ftnlen), ctpmv_(char *, char *, char *, integer *, complex *, - complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_(char *, - char *, char *, integer *, complex *, integer *, complex *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *, complex *, integer *), ctbmv_(char *, char *, char *, + integer *, integer *, complex *, integer *, complex *, integer *, + ftnlen, ftnlen, ftnlen), chpmv_(char *, integer *, complex *, + complex *, complex *, integer *, complex *, complex *, integer *, + ftnlen), ctbsv_(char *, char *, char *, integer *, integer *, + complex *, integer *, complex *, integer *, ftnlen, ftnlen, + ftnlen), ctpmv_(char *, char *, char *, integer *, complex *, + complex *, integer *, ftnlen, ftnlen, ftnlen), ctrmv_(char *, + char *, char *, integer *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen), ctpsv_(char *, char *, char *, - integer *, complex *, complex *, integer *, ftnlen, ftnlen, - ftnlen), ctrsv_(char *, char *, char *, integer *, complex *, + integer *, complex *, complex *, integer *, ftnlen, ftnlen, + ftnlen), ctrsv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen); real ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4655,9 +4665,9 @@ static logical c_false = FALSE_; } /* 
cchke_ */ -/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, complex *a, integer *nmax, complex *aa, integer *lda, - integer *kl, integer *ku, logical *reset, complex *transl, ftnlen +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + integer *kl, integer *ku, logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4718,7 +4728,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { i__3 = i__ + j * a_dim1; cbeg_(&q__2, reset); @@ -4953,8 +4963,8 @@ static logical c_false = FALSE_; /* Subroutine */ int cmvch_(char *trans, integer *m, integer *n, complex * alpha, complex *a, integer *nmax, complex *x, integer *incx, complex * - beta, complex *y, integer *incy, complex *yt, real *g, complex *yy, - real *eps, real *err, logical *fatal, integer *nout, logical *mv, + beta, complex *y, integer *incy, complex *yt, real *g, complex *yy, + real *eps, real *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -5057,15 +5067,15 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = j + i__ * a_dim1; i__6 = jx; - q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, q__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i; yt[i__3].r = q__1.r, yt[i__3].i = q__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j - + i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, + g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j + + i__ * a_dim1]), abs(r__2))) * 
((r__3 = x[i__4].r, abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4))); jx += incxl; /* L10: */ @@ -5077,14 +5087,14 @@ static logical c_false = FALSE_; i__4 = iy; r_cnjg(&q__3, &a[j + i__ * a_dim1]); i__5 = jx; - q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i, q__2.i = + q__2.r = q__3.r * x[i__5].r - q__3.i * x[i__5].i, q__2.i = q__3.r * x[i__5].i + q__3.i * x[i__5].r; q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i; yt[i__3].r = q__1.r, yt[i__3].i = q__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j - + i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, + g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[j + + i__ * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4))); jx += incxl; /* L20: */ @@ -5096,7 +5106,7 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = i__ + j * a_dim1; i__6 = jx; - q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + q__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, q__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; q__1.r = yt[i__4].r + q__2.r, q__1.i = yt[i__4].i + q__2.i; @@ -5104,7 +5114,7 @@ static logical c_false = FALSE_; i__3 = i__ + j * a_dim1; i__4 = jx; g[iy] += ((r__1 = a[i__3].r, abs(r__1)) + (r__2 = r_imag(&a[ - i__ + j * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, + i__ + j * a_dim1]), abs(r__2))) * ((r__3 = x[i__4].r, abs(r__3)) + (r__4 = r_imag(&x[jx]), abs(r__4))); jx += incxl; /* L30: */ @@ -5112,7 +5122,7 @@ static logical c_false = FALSE_; } i__2 = iy; i__3 = iy; - q__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, q__2.i = + q__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, q__2.i = alpha->r * yt[i__3].i + alpha->i * yt[i__3].r; i__4 = iy; q__3.r = beta->r * y[i__4].r - beta->i * y[i__4].i, q__3.i = beta->r * @@ -5121,7 +5131,7 @@ static logical c_false = FALSE_; yt[i__2].r = q__1.r, yt[i__2].i = q__1.i; i__2 = iy; g[iy] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = 
r_imag(alpha), abs( - r__2))) * g[iy] + ((r__3 = beta->r, abs(r__3)) + (r__4 = + r__2))) * g[iy] + ((r__3 = beta->r, abs(r__3)) + (r__4 = r_imag(beta), abs(r__4))) * ((r__5 = y[i__2].r, abs(r__5)) + ( r__6 = r_imag(&y[iy]), abs(r__6))); iy += incyl; @@ -5410,7 +5420,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/cblat3.c b/blastest/src/cblat3.c index a5b870f0f..e3d5e32a3 100644 --- a/blastest/src/cblat3.c +++ b/blastest/src/cblat3.c @@ -140,9 +140,14 @@ static integer c_n1 = -1; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "cblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*9] = "CGEMM " "CHEMM " "CSYMM " "CTRMM " "CTRSM " + static char snames[6*9] = "CGEMM " "CHEMM " "CSYMM " "CTRMM " "CTRSM " "CHERK " "CSYRK " "CHER2K" "CSYR2K"; /* Format strings */ @@ -186,10 +191,10 @@ static integer c_n1 = -1; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -209,34 +214,34 @@ static integer c_n1 = -1; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int cchk1_(char *, real *, real *, 
integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, complex *, integer *, complex *, integer *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, real *, ftnlen), cchk2_(char *, - real *, real *, integer *, integer *, logical *, logical *, - logical *, integer *, integer *, integer *, complex *, integer *, - complex *, integer *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - real *, ftnlen), cchk3_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, complex *, integer *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, complex *, - ftnlen), cchk4_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - complex *, integer *, complex *, integer *, complex *, complex *, - complex *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, real *, ftnlen), cchk5_(char *, real *, - real *, integer *, integer *, logical *, logical *, logical *, - integer *, integer *, integer *, complex *, integer *, complex *, - integer *, complex *, complex *, complex *, complex *, complex *, - complex *, complex *, complex *, complex *, real *, complex *, + extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, real *, ftnlen), cchk2_(char *, + real *, real *, integer *, integer *, logical *, logical *, + logical *, integer *, integer *, integer *, complex *, integer *, + complex *, integer *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, 
complex *, complex *, + real *, ftnlen), cchk3_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, complex *, integer *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, complex *, + ftnlen), cchk4_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, complex *, + complex *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, real *, ftnlen), cchk5_(char *, real *, + real *, integer *, integer *, logical *, logical *, logical *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, complex *, complex *, complex *, complex *, + complex *, complex *, complex *, complex *, real *, complex *, ftnlen), cchke_(integer *, char *, integer *, ftnlen); logical fatal; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); logical trace; integer nidim; @@ -508,7 +513,7 @@ static integer c_n1 = -1; goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -571,7 +576,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'N'; *(unsigned char *)transb = 'N'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, 
ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -586,7 +591,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -619,7 +624,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'C'; *(unsigned char *)transb = 'N'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -634,7 +639,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lce_(cc, ct, &n); if (! same || err != 0.f) { @@ -688,34 +693,34 @@ static integer c_n1 = -1; /* Test CGEMM, 01. */ L140: cchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test CHEMM, 02, CSYMM, 03. 
*/ L150: cchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test CTRMM, 04, CTRSM, 05. */ L160: cchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test CHERK, 06, CSYRK, 07. */ L170: cchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test CHER2K, 08, CSYR2K, 09. */ L180: cchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -759,14 +764,19 @@ static integer c_n1 = -1; /* End of CBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * - as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -791,7 +801,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; alist al__1; @@ -800,7 +810,7 @@ static integer c_n1 = -1; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; extern logical lce_(complex *, complex *, integer *); complex als, bls; @@ -808,21 +818,21 @@ static integer c_n1 = -1; complex beta; integer ldas, ldbs, ldcs; logical same, null; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, ftnlen, ftnlen), cmmch_(char *, - char *, integer *, integer *, integer *, complex *, complex *, - integer *, complex *, integer *, 
complex *, complex *, integer *, + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, ftnlen, ftnlen), cmmch_(char *, + char *, integer *, integer *, integer *, complex *, complex *, + integer *, complex *, integer *, complex *, complex *, integer *, complex *, real *, complex *, integer *, real *, real *, logical * , integer *, logical *, ftnlen, ftnlen); logical isame[13], trana, tranb; integer nargs; logical reset; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char tranas[1], tranbs[1], transa[1], transb[1]; real errmax; @@ -915,7 +925,7 @@ static integer c_n1 = -1; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = *(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -943,9 +953,9 @@ static integer c_n1 = -1; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1086,13 +1096,13 @@ static integer c_n1 = -1; isame[2] = ms == m; isame[3] = ns == n; isame[4] = ks == k; - isame[5] = als.r == alpha.r && als.i == + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lce_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lce_(&bs[1], &bb[1], &lbb); isame[9] = ldbs == ldb; - isame[10] = bls.r == beta.r && bls.i == + isame[10] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[11] = lce_(&cs[1], &cc[1], &lcc); @@ -1130,9 +1140,9 @@ static integer c_n1 = -1; cmmch_(transa, transb, &m, &n, &k, 
&alpha, &a[a_offset], nmax, &b[b_offset], - nmax, &beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1214,10 +1224,10 @@ static integer c_n1 = -1; } /* cchk1_ */ /* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * - as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1243,7 +1253,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; alist al__1; @@ -1252,7 +1262,7 @@ static integer c_n1 = -1; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc; extern logical lce_(complex *, complex *, integer *); integer ics; @@ -1265,26 +1275,26 @@ static integer c_n1 = -1; char side[1]; logical conj, left, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); 
complex alpha; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, - ftnlen, ftnlen), chemm_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, complex *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, + ftnlen, ftnlen), chemm_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int csymm_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, complex *, + extern /* Subroutine */ int csymm_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char uplos[1]; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; @@ -1426,7 +1436,7 @@ static integer c_n1 = -1; /* Generate the matrix C. */ - cmake_("GE", " ", " ", &m, &n, &c__[c_offset], + cmake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1522,9 +1532,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1569,14 +1579,14 @@ static integer c_n1 = -1; if (left) { cmmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { cmmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1657,9 +1667,9 @@ static integer c_n1 = -1; } /* cchk2_ */ /* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * - nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, + nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, complex *bs, complex *ct, real *g, complex *c__, ftnlen sname_len) { /* Initialized data */ @@ -1686,7 +1696,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; alist al__1; @@ -1708,27 +1718,27 @@ static integer c_n1 = -1; char side[1]; logical left, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, 
integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; char diags[1]; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, - integer *, integer *, complex *, complex *, integer *, complex *, + extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), ctrsm_(char *, char *, - char *, char *, integer *, integer *, complex *, complex *, + char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char tranas[1], transa[1]; real errmax; @@ -1867,7 +1877,7 @@ static integer c_n1 = -1; /* Generate the matrix B. 
*/ - cmake_("GE", " ", " ", &m, &n, &b[b_offset], + cmake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1939,7 +1949,7 @@ static integer c_n1 = -1; } ctrmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1972,7 +1982,7 @@ static integer c_n1 = -1; } ctrsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -1998,7 +2008,7 @@ static integer c_n1 = -1; unsigned char *)diag; isame[4] = ms == m; isame[5] = ns == n; - isame[6] = als.r == alpha.r && als.i == + isame[6] = als.r == alpha.r && als.i == alpha.i; isame[7] = lce_(&as[1], &aa[1], &laa); isame[8] = ldas == lda; @@ -2042,18 +2052,18 @@ static integer c_n1 = -1; cmmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { cmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -2066,14 +2076,14 @@ static integer c_n1 = -1; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { i__6 = i__ + j * c_dim1; i__7 = i__ + (j - 1) * ldb; c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; i__6 = i__ + (j - 1) * ldb; i__7 = i__ + j * b_dim1; - q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, q__1.i = alpha.r * b[i__7].i + alpha.i * b[ i__7].r; bb[i__6].r = q__1.r, 
bb[i__6].i = q__1.i; @@ -2084,20 +2094,20 @@ static integer c_n1 = -1; if (left) { cmmch_(transa, "N", &m, &n, &m, & - c_b2, &a[a_offset], nmax, + c_b2, &a[a_offset], nmax, &c__[c_offset], nmax, & - c_b1, &b[b_offset], nmax, + c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { cmmch_("N", transa, &m, &n, &n, & - c_b2, &c__[c_offset], - nmax, &a[a_offset], nmax, + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, &c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } @@ -2179,10 +2189,10 @@ static integer c_n1 = -1; } /* cchk3_ */ /* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * - as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -2213,7 +2223,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; alist al__1; @@ -2236,16 +2246,16 @@ static integer c_n1 = -1; real rals; logical tran, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, 
ftnlen); complex alpha; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, - ftnlen, ftnlen), cherk_(char *, char *, integer *, integer *, - real *, complex *, integer *, real *, complex *, integer *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, + ftnlen, ftnlen), cherk_(char *, char *, integer *, integer *, + real *, complex *, integer *, real *, complex *, integer *, ftnlen, ftnlen); real rbeta; logical isame[13]; @@ -2254,12 +2264,12 @@ static integer c_n1 = -1; logical reset; char trans[1]; logical upper; - extern /* Subroutine */ int csyrk_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, complex *, integer *, + extern /* Subroutine */ int csyrk_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); char uplos[1]; real ralpha; - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; char transs[1], transt[1]; @@ -2402,7 +2412,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || ralpha == 0.f) && + null = null || (k <= 0 || ralpha == 0.f) && rbeta == 1.f; } @@ -2481,7 +2491,7 @@ static integer c_n1 = -1; f_rew(&al__1); } cherk_(uplo, trans, &n, &k, &ralpha, &aa[1], & - lda, &rbeta, &cc[1], &ldc, (ftnlen)1, + lda, &rbeta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1); } else { if (*trace) { @@ -2528,16 +2538,16 @@ static integer c_n1 = -1; /* See what data 
changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; if (conj) { isame[4] = rals == ralpha; } else { - isame[4] = als.r == alpha.r && als.i == + isame[4] = als.r == alpha.r && als.i == alpha.i; } isame[5] = lce_(&as[1], &aa[1], &laa); @@ -2545,7 +2555,7 @@ static integer c_n1 = -1; if (conj) { isame[7] = rbets == rbeta; } else { - isame[7] = bets.r == beta.r && bets.i == + isame[7] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -2599,19 +2609,19 @@ static integer c_n1 = -1; } if (tran) { cmmch_(transt, "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { cmmch_("N", transt, &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2720,10 +2730,10 @@ static integer c_n1 = -1; } /* cchk4_ */ /* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, complex *alf, integer * nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex * - as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, + as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, complex 
*ct, real *g, complex *w, ftnlen sname_len) { /* Initialized data */ @@ -2778,14 +2788,14 @@ static integer c_n1 = -1; complex bets; logical tran, null; char uplo[1]; - extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, - integer *, complex *, integer *, complex *, integer *, logical *, + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, complex *, ftnlen, ftnlen, ftnlen); complex alpha; - extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, complex *, real *, complex *, - integer *, real *, real *, logical *, integer *, logical *, + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); real rbeta; logical isame[13]; @@ -2795,12 +2805,12 @@ static integer c_n1 = -1; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int cher2k_(char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, real *, - complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, char *, - integer *, integer *, complex *, complex *, integer *, complex *, + extern /* Subroutine */ int cher2k_(char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, real *, + complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); - extern logical lceres_(char *, char *, integer *, integer *, complex *, + extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real errmax; char 
transs[1], transt[1]; @@ -2957,7 +2967,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || alpha.r == 0.f && + null = null || (k <= 0 || alpha.r == 0.f && alpha.i == 0.f) && rbeta == 1.f; } @@ -3092,9 +3102,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -3106,7 +3116,7 @@ static integer c_n1 = -1; if (conj) { isame[9] = rbets == rbeta; } else { - isame[9] = bets.r == beta.r && bets.i == + isame[9] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -3162,20 +3172,20 @@ static integer c_n1 = -1; i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = (j - 1 << 1) * *nmax + k + i__; - q__1.r = alpha.r * ab[i__8].r - - alpha.i * ab[i__8].i, + q__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, q__1.i = alpha.r * ab[ i__8].i + alpha.i * ab[ i__8].r; - w[i__7].r = q__1.r, w[i__7].i = + w[i__7].r = q__1.r, w[i__7].i = q__1.i; if (conj) { i__7 = k + i__; r_cnjg(&q__2, &alpha); i__8 = (j - 1 << 1) * *nmax + i__; - q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, + q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; @@ -3183,7 +3193,7 @@ static integer c_n1 = -1; i__7 = k + i__; i__8 = (j - 1 << 1) * *nmax + i__; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; } @@ -3194,9 +3204,9 @@ static integer c_n1 = -1; i__8 = *nmax << 1; cmmch_(transt, "N", &lj, &c__1, &i__6, &c_b2, &ab[jjab], &i__7, &w[ - 1], &i__8, &beta, &c__[jj + j + 1], &i__8, 
&beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1] - , &cc[jc], &ldc, eps, &err, + , &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { @@ -3205,14 +3215,14 @@ static integer c_n1 = -1; if (conj) { i__7 = i__; r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]); - q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, - q__1.i = alpha.r * q__2.i + alpha.i * + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * q__2.r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__2.i = alpha.r * ab[i__8].i + alpha.i + .i, q__2.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; r_cnjg(&q__1, &q__2); w[i__7].r = q__1.r, w[i__7].i = q__1.i; @@ -3220,13 +3230,13 @@ static integer c_n1 = -1; i__7 = i__; i__8 = (k + i__ - 1) * *nmax + j; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; } @@ -3236,9 +3246,9 @@ static integer c_n1 = -1; i__7 = *nmax << 1; cmmch_("N", "N", &lj, &c__1, &i__6, & c_b2, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3351,7 +3361,7 @@ static integer c_n1 = -1; } /* cchk5_ */ -/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int cchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3364,34 +3374,34 @@ static integer c_n1 = -1; integer s_wsfe(cilist 
*), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - complex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] + complex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was [2][1] */, beta, alpha; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, - integer *, complex *, complex *, integer *, complex *, integer *, - complex *, complex *, integer *, ftnlen, ftnlen), chemm_(char *, - char *, integer *, integer *, complex *, complex *, integer *, - complex *, integer *, complex *, complex *, integer *, ftnlen, - ftnlen), cherk_(char *, char *, integer *, integer *, real *, - complex *, integer *, real *, complex *, integer *, ftnlen, + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, ftnlen, ftnlen), chemm_(char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *, complex *, complex *, integer *, ftnlen, + ftnlen), cherk_(char *, char *, integer *, integer *, real *, + complex *, integer *, real *, complex *, integer *, ftnlen, ftnlen); real rbeta; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, - integer *, integer *, complex *, complex *, integer *, complex *, + extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), csymm_(char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, - integer *, complex *, complex *, integer *, ftnlen, ftnlen), - ctrsm_(char *, char *, char *, char *, integer *, integer *, - complex *, complex *, integer *, complex *, integer *, ftnlen, - ftnlen, ftnlen, ftnlen), csyrk_(char *, char *, integer *, - integer *, complex *, complex *, integer *, complex *, complex *, - integer *, ftnlen, ftnlen), cher2k_(char *, char *, integer *, - integer *, complex 
*, complex *, integer *, complex *, integer *, - real *, complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, - char *, integer *, integer *, complex *, complex *, integer *, - complex *, integer *, complex *, complex *, integer *, ftnlen, + integer *, complex *, complex *, integer *, ftnlen, ftnlen), + ctrsm_(char *, char *, char *, char *, integer *, integer *, + complex *, complex *, integer *, complex *, integer *, ftnlen, + ftnlen, ftnlen, ftnlen), csyrk_(char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, complex *, + integer *, ftnlen, ftnlen), cher2k_(char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *, ftnlen, ftnlen), csyr2k_(char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *, complex *, complex *, integer *, ftnlen, ftnlen); real ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3451,302 +3461,302 @@ static integer c_n1 = -1; } L10: infoc_1.infot = 1; - cgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - cgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - cgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, 
&infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - cgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - cgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - cgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, 
(ftnlen) 6); infoc_1.infot = 3; - cgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - cgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); 
infoc_1.infot = 4; - cgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - cgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; 
- cgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("T", "C", 
&c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - cgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("C", "T", &c__0, &c__0, 
&c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - cgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, 
a, &c__1, b, &c__1, &beta, + cgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - cgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, 
b, &c__1, &beta, + cgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + cgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + cgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - cgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, 
&beta, + cgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -4926,9 +4936,9 @@ static integer c_n1 = -1; } /* cchke_ */ -/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, complex *a, integer *nmax, complex *aa, integer *lda, - logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len, +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + logical *reset, complex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -5114,10 +5124,10 @@ static integer c_n1 = -1; } /* cmake_ */ /* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, - integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, + n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, + integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * - fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen + fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) { /* Format strings */ @@ -5131,7 +5141,7 @@ static integer c_n1 = -1; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; real r__1, r__2, r__3, r__4, r__5, r__6; complex q__1, q__2, q__3, q__4; @@ -5190,9 +5200,9 @@ static integer c_n1 = -1; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; 
- tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; ctrana = *(unsigned char *)transa == 'C'; ctranb = *(unsigned char *)transb == 'C'; @@ -5220,17 +5230,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; i__7 = k + j * b_dim1; - q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = i__ + k * a_dim1; i__5 = k + j * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag( &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[ - i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * + i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * b_dim1]), abs(r__4))); /* L20: */ } @@ -5246,15 +5256,15 @@ static integer c_n1 = -1; i__5 = i__; r_cnjg(&q__3, &a[k + i__ * a_dim1]); i__6 = k + j * b_dim1; - q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6] .r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[k + j * b_dim1]), abs(r__4))); @@ -5274,12 +5284,12 @@ static integer c_n1 = -1; q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = 
q__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[k + j * b_dim1]), abs(r__4))); @@ -5298,15 +5308,15 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; r_cnjg(&q__3, &b[j + k * b_dim1]); - q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, - q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * q__3.r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[j + k * b_dim1]), abs(r__4))); @@ -5326,12 +5336,12 @@ static integer c_n1 = -1; q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( &b[j + k * b_dim1]), abs(r__4))); @@ -5351,17 +5361,17 @@ static integer c_n1 = -1; i__5 = i__; r_cnjg(&q__3, &a[k + i__ * a_dim1]); r_cnjg(&q__4, &b[j + k * b_dim1]); - q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, - q__2.i = q__3.r * q__4.i + q__3.i * + q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, + q__2.i = q__3.r * q__4.i + q__3.i * q__4.r; - q__1.r = ct[i__5].r + q__2.r, 
q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L120: */ } @@ -5376,17 +5386,17 @@ static integer c_n1 = -1; i__5 = i__; r_cnjg(&q__3, &a[k + i__ * a_dim1]); i__6 = j + k * b_dim1; - q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, q__2.i = q__3.r * b[i__6].i + q__3.i * b[ i__6].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L140: */ } @@ -5403,17 +5413,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = k + i__ * a_dim1; r_cnjg(&q__3, &b[j + k * b_dim1]); - q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, - q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * q__3.r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L160: */ } @@ -5429,16 +5439,16 @@ static integer c_n1 = -1; i__6 = k + i__ * a_dim1; i__7 = j + k * b_dim1; q__2.r = 
a[i__6].r * b[i__7].r - a[i__6].i * b[ - i__7].i, q__2.i = a[i__6].r * b[i__7].i + + i__7].i, q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[i__7].r; - q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + q__2.i; ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(&a[k + i__ * a_dim1]), abs(r__2))) - * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(&b[j + k * b_dim1]), abs(r__4))); /* L180: */ } @@ -5451,17 +5461,17 @@ static integer c_n1 = -1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; - q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = + q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; i__5 = i__ + j * c_dim1; - q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = + q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = beta->r * c__[i__5].i + beta->i * c__[i__5].r; q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; ct[i__3].r = q__1.r, ct[i__3].i = q__1.i; i__3 = i__ + j * c_dim1; - g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), + g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + ( - r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, + r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs( r__6))); /* L200: */ @@ -5772,7 +5782,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/dblat1.c b/blastest/src/dblat1.c index 14665d844..e84867178 100644 --- a/blastest/src/dblat1.c 
+++ b/blastest/src/dblat1.c @@ -70,6 +70,11 @@ static real c_b81 = 0.f; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "dblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static doublereal sfac = 9.765625e-4; @@ -85,7 +90,7 @@ static real c_b81 = 0.f; /* Local variables */ integer ic; - extern /* Subroutine */ int check0_(doublereal *), check1_(doublereal *), + extern /* Subroutine */ int check0_(doublereal *), check1_(doublereal *), check2_(doublereal *), check3_(doublereal *), header_(void); /* Fortran I/O blocks */ @@ -124,11 +129,11 @@ static real c_b81 = 0.f; combla_1.incy = 9999; if (combla_1.icase == 3 || combla_1.icase == 11) { check0_(&sfac); - } else if (combla_1.icase == 7 || combla_1.icase == 8 || + } else if (combla_1.icase == 7 || combla_1.icase == 8 || combla_1.icase == 9 || combla_1.icase == 10) { check1_(&sfac); - } else if (combla_1.icase == 1 || combla_1.icase == 2 || - combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase + } else if (combla_1.icase == 1 || combla_1.icase == 2 || + combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase == 12 || combla_1.icase == 13) { check2_(&sfac); } else if (combla_1.icase == 4) { @@ -143,7 +148,12 @@ static real c_b81 = 0.f; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -201,17 +211,17 @@ static real c_b81 = 0.f; static doublereal dc1[8] = { .6,.8,-.6,.8,.6,1.,0.,1. 
}; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); /* Local variables */ integer i__, k; doublereal sa, sb, sc, ss, dtemp[9]; - extern /* Subroutine */ int drotg_(doublereal *, doublereal *, doublereal - *, doublereal *), stest_(integer *, doublereal *, doublereal *, - doublereal *, doublereal *), stest1_(doublereal *, doublereal *, - doublereal *, doublereal *), drotmg_(doublereal *, doublereal *, + extern /* Subroutine */ int drotg_(doublereal *, doublereal *, doublereal + *, doublereal *), stest_(integer *, doublereal *, doublereal *, + doublereal *, doublereal *), stest1_(doublereal *, doublereal *, + doublereal *, doublereal *), drotmg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); /* Fortran I/O blocks */ @@ -319,7 +329,7 @@ static real c_b81 = 0.f; doublereal d__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -328,12 +338,12 @@ static real c_b81 = 0.f; doublereal sx[8]; integer np1, len; extern doublereal dnrm2_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, integer *); extern doublereal dasum_(integer *, doublereal *, integer *); doublereal stemp[1], strue[8]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, - doublereal *, doublereal *), itest1_(integer *, integer *), + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + doublereal *, doublereal *), itest1_(integer *, integer *), stest1_(doublereal *, doublereal *, doublereal *, doublereal *); extern integer idamax_(integer *, 
doublereal *, integer *); @@ -375,11 +385,11 @@ static real c_b81 = 0.f; stest1_(&d__1, stemp, stemp, sfac); } else if (combla_1.icase == 9) { /* .. DSCAL .. */ - dscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], + dscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], sx, &combla_1.incx); i__1 = len; for (i__ = 1; i__ <= i__1; ++i__) { - strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << + strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << 3) - 49]; /* L40: */ } @@ -446,71 +456,71 @@ static real c_b81 = 0.f; -3.,-4.,5.,0.,0.,2.,-3.,0.,1.,5.,2.,0.,-4. }; static struct { doublereal e_1[448]; - } equiv_3 = {{ .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., - .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, - 0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., - 0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, 0., - 0., 0., 0., 0., -.8, 3.8, 0., 0., 0., 0., 0., -.9, 2.8, 0., + } equiv_3 = {{ .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., + .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, + 0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., + 0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, 0., + 0., 0., 0., 0., -.8, 3.8, 0., 0., 0., 0., 0., -.9, 2.8, 0., 0., 0., 0., 0., 3.5, -.4, 0., 0., 0., 0., 0., .6, .1, -.5, .8, 0., 0., 0., -.8, 3.8, -2.2, -1.2, 0., 0., 0., -.9, 2.8, -1.4, - -1.3, 0., 0., 0., 3.5, -.4, -2.2, 4.7, 0., 0., 0., .6, 0., - 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., - 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., - 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., - 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, -.5, 0., 0., 0., + -1.3, 0., 0., 0., 3.5, -.4, -2.2, 4.7, 0., 0., 0., .6, 0., + 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., + 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., + 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., + 0., 0., 3.5, 0., 0., 0., 0., 0., 
0., .6, .1, -.5, 0., 0., 0., 0., 0., .1, -3., 0., 0., 0., 0., -.3, .1, -2., 0., 0., 0., 0., - 3.3, .1, -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, - -2., .1, 1.4, .8, .6, -.3, -2.8, -1.8, .1, 1.3, .8, 0., -.3, - -1.9, 3.8, .1, -3.1, .8, 4.8, -.3, -1.5, .6, 0., 0., 0., 0., - 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., - 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., + 3.3, .1, -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, + -2., .1, 1.4, .8, .6, -.3, -2.8, -1.8, .1, 1.3, .8, 0., -.3, + -1.9, 3.8, .1, -3.1, .8, 4.8, -.3, -1.5, .6, 0., 0., 0., 0., + 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., + 0., .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0., 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0., 0., 0., 0., 0., .6, .1, -.5, 0., 0., 0., 0., 4.8, .1, - -3., 0., 0., 0., 0., 3.3, .1, -2., 0., 0., 0., 0., 2.1, .1, - -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, -1.6, .1, - -2.2, .8, 5.4, -.3, -2.8, -1.5, .1, -1.4, .8, 3.6, -.3, -1.9, + -3., 0., 0., 0., 0., 3.3, .1, -2., 0., 0., 0., 0., 2.1, .1, + -2., 0., 0., 0., 0., .6, .1, -.5, .8, .9, -.3, -.4, -1.6, .1, + -2.2, .8, 5.4, -.3, -2.8, -1.5, .1, -1.4, .8, 3.6, -.3, -1.9, 3.7, .1, -2.2, .8, 3.6, -.3, -1.5, .6, 0., 0., 0., 0., 0., 0., - .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, - 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0., - 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0., - 0., 0., 0., 0., .6, .1, 0., 0., 0., 0., 0., -.8, -1., 0., 0., + .6, 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., .6, + 0., 0., 0., 0., 0., 0., .6, 0., 0., 0., 0., 0., 0., -.8, 0., + 0., 0., 0., 0., 0., -.9, 0., 0., 0., 0., 0., 0., 3.5, 0., 0., + 0., 0., 0., 0., .6, .1, 0., 0., 0., 0., 0., -.8, -1., 0., 0., 0., 0., 0., -.9, -.8, 0., 0., 0., 0., 0., 3.5, .8, 0., 0., 0., 0., 0., .6, .1, -.5, .8, 0., 0., 0., -.8, -1., 1.4, -1.6, 0., - 0., 0., -.9, -.8, 1.3, -1.6, 0., 0., 0., 3.5, .8, -3.1, 4.8, + 0., 0., -.9, -.8, 
1.3, -1.6, 0., 0., 0., 3.5, .8, -3.1, 4.8, 0., 0., 0. }}; static struct { doublereal e_1[448]; - } equiv_7 = {{ .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., - .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, - 0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., + } equiv_7 = {{ .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., + .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, + 0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, 0., - 0., 0., 0., 0., .7, -4.8, 0., 0., 0., 0., 0., 1.7, -.7, 0., - 0., 0., 0., 0., -2.6, 3.5, 0., 0., 0., 0., 0., .5, -.9, .3, - .7, 0., 0., 0., .7, -4.8, 3., 1.1, 0., 0., 0., 1.7, -.7, -.7, + 0., 0., 0., 0., .7, -4.8, 0., 0., 0., 0., 0., 1.7, -.7, 0., + 0., 0., 0., 0., -2.6, 3.5, 0., 0., 0., 0., 0., .5, -.9, .3, + .7, 0., 0., 0., .7, -4.8, 3., 1.1, 0., 0., 0., 1.7, -.7, -.7, 2.3, 0., 0., 0., -2.6, 3.5, -.7, -3.6, 0., 0., 0., .5, 0., 0., - 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., - 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., - 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., + 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., + 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., + 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., - 4., -.9, -.3, 0., 0., 0., 0., -.5, -.9, 1.5, 0., 0., 0., 0., + 4., -.9, -.3, 0., 0., 0., 0., -.5, -.9, 1.5, 0., 0., 0., 0., -1.5, -.9, -1.8, 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, - 3.7, -.9, -1.2, .7, -1.5, .2, 2.2, -.3, -.9, 2.1, .7, -1.6, - .2, 2., -1.6, -.9, -2.1, .7, 2.9, .2, -3.8, .5, 0., 0., 0., - 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., - 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., - 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., + 3.7, -.9, -1.2, .7, -1.5, .2, 2.2, -.3, -.9, 2.1, .7, -1.6, + .2, 
2., -1.6, -.9, -2.1, .7, 2.9, .2, -3.8, .5, 0., 0., 0., + 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., + 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., + 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., 0., 0., 0., 0., .5, -.9, 0., 0., 0., 0., 0., 4., - -6.3, 0., 0., 0., 0., 0., -.5, .3, 0., 0., 0., 0., 0., -1.5, - 3., 0., 0., 0., 0., 0., .5, -.9, .3, .7, 0., 0., 0., 3.7, - -7.2, 3., 1.7, 0., 0., 0., -.3, .9, -.7, 1.9, 0., 0., 0., - -1.6, 2.7, -.7, -3.4, 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., - .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, - 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .7, 0., + -6.3, 0., 0., 0., 0., 0., -.5, .3, 0., 0., 0., 0., 0., -1.5, + 3., 0., 0., 0., 0., 0., .5, -.9, .3, .7, 0., 0., 0., 3.7, + -7.2, 3., 1.7, 0., 0., 0., -.3, .9, -.7, 1.9, 0., 0., 0., + -1.6, 2.7, -.7, -3.4, 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., + .5, 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .5, + 0., 0., 0., 0., 0., 0., .5, 0., 0., 0., 0., 0., 0., .7, 0., 0., 0., 0., 0., 0., 1.7, 0., 0., 0., 0., 0., 0., -2.6, 0., 0., - 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., .7, -.9, 1.2, + 0., 0., 0., 0., .5, -.9, .3, 0., 0., 0., 0., .7, -.9, 1.2, 0., 0., 0., 0., 1.7, -.9, .5, 0., 0., 0., 0., -2.6, -.9, -1.3, - 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, .7, -.9, 1.2, + 0., 0., 0., 0., .5, -.9, .3, .7, -.6, .2, .8, .7, -.9, 1.2, .7, -1.5, .2, 1.6, 1.7, -.9, .5, .7, -1.6, .2, 2.4, -2.6, -.9, -1.3, .7, 2.9, .2, -4. 
}}; @@ -521,7 +531,7 @@ static real c_b81 = 0.f; doublereal d__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -532,7 +542,7 @@ static real c_b81 = 0.f; doublereal sx[7], sy[7]; integer kni; doublereal stx[7], sty[7]; - extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, + extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); integer kpar, lenx, leny; #define dt19x ((doublereal *)&equiv_3) @@ -547,16 +557,16 @@ static real c_b81 = 0.f; #define dt19yc ((doublereal *)&equiv_7 + 224) #define dt19yd ((doublereal *)&equiv_7 + 336) extern doublereal dsdot_(integer *, real *, integer *, real *, integer *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer ksize; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *), drotm_(integer *, doublereal + extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *), drotm_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *), dswap_( integer *, doublereal *, integer *, doublereal *, integer *); doublereal ssize[7]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, - doublereal *, doublereal *), stest1_(doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + doublereal *, doublereal *), stest1_(doublereal *, doublereal *, doublereal *, doublereal *); /* Fortran I/O blocks */ @@ -616,7 +626,7 @@ static real c_b81 = 0.f; /* .. DDOT .. 
*/ d__1 = ddot_(&combla_1.n, sx, &combla_1.incx, sy, & combla_1.incy); - stest1_(&d__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], + stest1_(&d__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. DAXPY .. */ @@ -653,9 +663,9 @@ static real c_b81 = 0.f; for (i__ = 1; i__ <= 7; ++i__) { sx[i__ - 1] = dx1[i__ - 1]; sy[i__ - 1] = dy1[i__ - 1]; - stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - + stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - 36]; - sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - + sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - 36]; } @@ -746,7 +756,7 @@ static real c_b81 = 0.f; 1.17,1.17,1.17,1.17,1.17,1.17,1.17 }; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -755,13 +765,13 @@ static real c_b81 = 0.f; doublereal sx[7], sy[7], stx[7], sty[7]; integer lenx, leny; doublereal mwpc[11]; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer mwpn[11]; doublereal mwps[11], mwpx[5], mwpy[5]; integer ksize; doublereal copyx[5], copyy[5]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal mwptx[55] /* was [11][5] */, mwpty[55] /* was [11][5] */; @@ -1090,11 +1100,11 @@ static real c_b81 = 0.f; } /* testdsdot_ */ -/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, +/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, doublereal *ssize, doublereal *sfac) { doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int 
stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); /* ************************* STEST1 ***************************** */ diff --git a/blastest/src/dblat2.c b/blastest/src/dblat2.c index 0cdc8f16f..7982c67c5 100644 --- a/blastest/src/dblat2.c +++ b/blastest/src/dblat2.c @@ -155,10 +155,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "dblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*16] = "DGEMV " "DGBMV " "DSYMV " "DSBMV " "DSPMV " - "DTRMV " "DTBMV " "DTPMV " "DTRSV " "DTBSV " "DTPSV " "DGER " + static char snames[6*16] = "DGEMV " "DGBMV " "DSYMV " "DSBMV " "DSPMV " + "DTRMV " "DTBMV " "DTPMV " "DTRSV " "DTBSV " "DTPSV " "DGER " "DSYR " "DSPR " "DSYR2 " "DSPR2 "; /* Format strings */ @@ -204,10 +209,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -227,50 +232,50 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, integer *, integer *, doublereal *, integer - *, doublereal *, integer *, integer *, integer *, integer *, - doublereal *, doublereal 
*, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, - doublereal *, doublereal *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, integer *, integer *, integer *, integer *, + extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, integer *, integer *, doublereal *, integer + *, doublereal *, integer *, integer *, integer *, integer *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, + doublereal *, doublereal *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, integer *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk4_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, 
logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, integer *, integer - *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk5_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, integer *, integer - *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk6_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, integer *, integer - *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchke_(integer *, char *, integer *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk4_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, integer *, integer + *, integer *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk5_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, integer *, integer + *, integer *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk6_(char *, doublereal *, doublereal *, + integer *, integer *, 
logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, integer *, integer + *, integer *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchke_(integer *, char *, integer *, ftnlen); logical fatal, trace; integer nidim; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); char snaps[32], trans[1]; integer isnum; @@ -621,7 +626,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 16; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -668,7 +673,7 @@ static logical c_false = FALSE_; } i__1 = n; for (j = 1; j <= i__1; ++j) { - yy[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - + yy[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3); /* L130: */ } @@ -748,44 +753,44 @@ static logical c_false = FALSE_; /* Test DGEMV, 01, and DGBMV, 02. */ L140: dchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. 
*/ L150: dchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test DTRMV, 06, DTBMV, 07, DTPMV, 08, */ /* DTRSV, 09, DTBSV, 10, and DTPSV, 11. */ L160: dchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test DGER, 12. */ L170: dchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test DSYR, 13, and DSPR, 14. */ L180: dchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test DSYR2, 15, and DSPR2, 16. */ L190: dchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -827,16 +832,21 @@ static logical c_false = FALSE_; /* End of DBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ -/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, doublereal *ys, doublereal *yt, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -881,21 +891,21 @@ static logical c_false = FALSE_; logical same; integer incx, incy; logical full, tran, null; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; extern /* Subroutine */ int dgbmv_(char *, integer *, integer *, integer * - , integer *, doublereal *, doublereal *, integer *, doublereal *, + , integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen), dgemv_( - char *, integer *, integer *, doublereal *, doublereal *, integer + char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, 
integer *, doublereal *, doublereal *, integer *, - ftnlen), dmvch_(char *, integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, + ftnlen), dmvch_(char *, integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, logical *, integer *, logical *, + doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -1079,9 +1089,9 @@ static logical c_false = FALSE_; transl = 0.; i__7 = abs(incy); i__8 = ml - 1; - dmake_("GE", " ", " ", &c__1, &ml, &y[1], + dmake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1089,7 +1099,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. */ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1149,7 +1159,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - dgemv_(trans, &m, &n, &alpha, &aa[1], + dgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1276,8 +1286,8 @@ static logical c_false = FALSE_; dmvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1381,13 +1391,13 @@ static logical c_false = FALSE_; } /* dchk1_ */ -/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer 
*kb, - integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublereal *alf, integer *nbet, doublereal *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, doublereal *ys, doublereal *yt, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -1425,7 +1435,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; extern logical lde_(doublereal *, doublereal *, integer *); doublereal als, bls, err, beta; @@ -1434,29 +1444,29 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, 
doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; - extern /* Subroutine */ int dsbmv_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dsbmv_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen); logical reset; integer incxs, incys; - extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, + extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen); char uplos[1]; - extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen); logical banded, packed; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -1619,7 +1629,7 @@ static logical c_false = FALSE_; i__8 = n - 1; dmake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, &yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1836,8 +1846,8 @@ static logical c_false = FALSE_; /* Check the result. 
*/ - dmvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + dmvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -1947,12 +1957,12 @@ static logical c_false = FALSE_; } /* dchk2_ */ -/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *xt, doublereal *g, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *xt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Initialized data */ @@ -2002,36 +2012,36 @@ static logical c_false = FALSE_; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); char diags[1]; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, 
- doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; - extern /* Subroutine */ int dtbmv_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dtbmv_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); logical reset; - extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); integer incxs; char trans[1]; - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, - doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), + extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, - doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtpsv_(char *, - char *, char *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtpsv_(char *, + char *, char *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *, - doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, + extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *, + doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -2160,13 +2170,13 @@ static logical c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + 
*(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl = 0.; - dmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + dmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2213,7 +2223,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2266,7 +2276,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - dtbmv_(uplo, trans, diag, &n, &k, &aa[1], + dtbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2347,7 +2357,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - dtbsv_(uplo, trans, diag, &n, &k, &aa[1], + dtbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2389,11 +2399,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2464,7 +2474,7 @@ static logical c_false = FALSE_; dmvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &x[1], &incx, & c_b120, &z__[1], &incx, &xt[1], & - g[1], &xx[1], eps, &err, fatal, + g[1], &xx[1], eps, &err, fatal, nout, &c_true, (ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( ftnlen)2) == 0) { @@ -2473,7 +2483,7 @@ static logical c_false = FALSE_; i__4 = n; for (i__ = 1; i__ <= i__4; ++i__) { - z__[i__] = xx[(i__ - 1) * abs(incx) + + z__[i__] = xx[(i__ - 1) * abs(incx) + 1]; xx[(i__ - 1) * abs(incx) + 1] = x[i__] ; @@ -2482,7 +2492,7 @@ static logical c_false = FALSE_; dmvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &z__[1], &incx, & c_b120, &x[1], &incx, &xt[1], &g[ - 1], &xx[1], eps, &err, fatal, + 1], &xx[1], eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2584,13 +2594,13 @@ static logical c_false = FALSE_; } /* dchk3_ */ -/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, - doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal 
*aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Format strings */ @@ -2625,23 +2635,23 @@ static logical c_false = FALSE_; integer ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly, laa, lda; extern logical lde_(doublereal *, doublereal *, integer *); doublereal als, err; - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer ldas; logical same; integer incx, incy; logical null; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -2748,7 +2758,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; dmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 
1) { x[m / 2] = 0.; @@ -2782,7 +2792,7 @@ static logical c_false = FALSE_; transl = 0.; i__5 = m - 1; i__6 = n - 1; - dmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + dmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2913,9 +2923,9 @@ static logical c_false = FALSE_; } else { w[0] = y[n - j + 1]; } - dmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + dmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, &c_b128, &a[j * a_dim1 + 1], - &c__1, &yt[1], &g[1], &aa[(j - 1) * + &c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -2995,13 +3005,13 @@ static logical c_false = FALSE_; } /* dchk4_ */ -/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, - doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Initialized data */ @@ -3047,21 +3057,21 @@ static logical c_false = FALSE_; logical same; integer incx; logical full; - extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, + extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, ftnlen); logical null; char 
uplo[1]; - extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, + extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen), dmake_( - char *, char *, char *, integer *, integer *, doublereal *, - integer *, doublereal *, integer *, integer *, integer *, logical + char *, char *, char *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *, integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3173,7 +3183,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; dmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.; @@ -3342,9 +3352,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - dmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + dmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, &c__1, &c_b128, &a[jj + j * a_dim1], & - c__1, &yt[1], &g[1], &aa[ja], eps, &err, + c__1, &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3442,13 +3452,13 @@ static logical c_false = FALSE_; } /* dchk5_ */ -/* Subroutine */ int dchk6_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk6_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, 
logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublereal *a, doublereal *aa, doublereal *as, doublereal *x, - doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, - doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublereal *a, doublereal *aa, doublereal *as, doublereal *x, + doublereal *xx, doublereal *xs, doublereal *y, doublereal *yy, + doublereal *ys, doublereal *yt, doublereal *g, doublereal *z__, ftnlen sname_len) { /* Initialized data */ @@ -3477,7 +3487,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -3496,19 +3506,19 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - ftnlen), dsyr2_(char *, integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *, doublereal *, integer *, - ftnlen), dmake_(char *, char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *, integer *, + extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + ftnlen), dsyr2_(char *, integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *, doublereal *, integer *, + ftnlen), dmake_(char *, char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, integer *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; logical 
isame[13]; - extern /* Subroutine */ int dmvch_(char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmvch_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, logical *, integer *, + doublereal *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer nargs; logical reset; @@ -3622,7 +3632,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; dmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.; @@ -3657,7 +3667,7 @@ static logical c_false = FALSE_; transl = 0.; i__5 = n - 1; i__6 = n - 1; - dmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + dmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3835,7 +3845,7 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - dmvch_("N", &lj, &c__2, &alpha, &z__[jj + + dmvch_("N", &lj, &c__2, &alpha, &z__[jj + z_dim1], nmax, w, &c__1, &c_b128, &a[ jj + j * a_dim1], &c__1, &yt[1], &g[1] , &aa[ja], eps, &err, fatal, nout, & @@ -3941,7 +3951,7 @@ static logical c_false = FALSE_; } /* dchk6_ */ -/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3955,39 +3965,39 @@ static logical c_false = FALSE_; /* Local variables */ doublereal a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - integer *), dspr_(char *, integer *, doublereal *, doublereal *, - 
integer *, doublereal *, ftnlen), dsyr_(char *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, - ftnlen), dspr2_(char *, integer *, doublereal *, doublereal *, + extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dspr_(char *, integer *, doublereal *, doublereal *, + integer *, doublereal *, ftnlen), dsyr_(char *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, + ftnlen), dspr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, ftnlen), dsyr2_( - char *, integer *, doublereal *, doublereal *, integer *, + char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, ftnlen); doublereal alpha; extern /* Subroutine */ int dgbmv_(char *, integer *, integer *, integer * - , integer *, doublereal *, doublereal *, integer *, doublereal *, + , integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen), dgemv_( - char *, integer *, integer *, doublereal *, doublereal *, integer + char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, - ftnlen), dsbmv_(char *, integer *, integer *, doublereal *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - doublereal *, integer *, ftnlen), dtbmv_(char *, char *, char *, - integer *, integer *, doublereal *, integer *, doublereal *, + ftnlen), dsbmv_(char *, integer *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + doublereal *, integer *, ftnlen), dtbmv_(char *, char *, char *, + integer *, integer *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), dtbsv_(char *, char *, char *, - integer *, integer *, doublereal *, integer *, doublereal *, - integer *, ftnlen, 
ftnlen, ftnlen), dspmv_(char *, integer *, + integer *, integer *, doublereal *, integer *, doublereal *, + integer *, ftnlen, ftnlen, ftnlen), dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, - doublereal *, integer *, ftnlen), dtpmv_(char *, char *, char *, - integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen, - ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *, - integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), - dtpsv_(char *, char *, char *, integer *, doublereal *, - doublereal *, integer *, ftnlen, ftnlen, ftnlen), dsymv_(char *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + doublereal *, integer *, ftnlen), dtpmv_(char *, char *, char *, + integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen, + ftnlen), dtrmv_(char *, char *, char *, integer *, doublereal *, + integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen), + dtpsv_(char *, char *, char *, integer *, doublereal *, + doublereal *, integer *, ftnlen, ftnlen, ftnlen), dsymv_(char *, + integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen), dtrsv_( - char *, char *, char *, integer *, doublereal *, integer *, - doublereal *, integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, + char *, char *, char *, integer *, doublereal *, integer *, + doublereal *, integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4493,9 +4503,9 @@ static logical c_false = FALSE_; } /* dchke_ */ -/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, integer *n, doublereal *a, integer *nmax, doublereal *aa, integer * - lda, integer *kl, integer *ku, logical *reset, doublereal *transl, + lda, integer *kl, integer *ku, logical *reset, doublereal *transl, ftnlen 
type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4553,7 +4563,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { a[i__ + j * a_dim1] = dbeg_(reset) + *transl; } else { @@ -4728,9 +4738,9 @@ static logical c_false = FALSE_; } /* dmake_ */ /* Subroutine */ int dmvch_(char *trans, integer *m, integer *n, doublereal * - alpha, doublereal *a, integer *nmax, doublereal *x, integer *incx, - doublereal *beta, doublereal *y, integer *incy, doublereal *yt, - doublereal *g, doublereal *yy, doublereal *eps, doublereal *err, + alpha, doublereal *a, integer *nmax, doublereal *x, integer *incx, + doublereal *beta, doublereal *y, integer *incy, doublereal *yt, + doublereal *g, doublereal *yy, doublereal *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -4845,7 +4855,7 @@ static logical c_false = FALSE_; *err = 0.; i__1 = ml; for (i__ = 1; i__ <= i__1; ++i__) { - erri = (d__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(d__1)) / + erri = (d__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(d__1)) / *eps; if (g[i__] != 0.) 
{ erri /= g[i__]; @@ -5102,7 +5112,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/dblat3.c b/blastest/src/dblat3.c index d7a85e29c..b4698f56c 100644 --- a/blastest/src/dblat3.c +++ b/blastest/src/dblat3.c @@ -135,9 +135,14 @@ static integer c__2 = 2; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "dblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*6] = "DGEMM " "DSYMM " "DTRMM " "DTRSM " "DSYRK " + static char snames[6*6] = "DGEMM " "DSYMM " "DTRMM " "DTRSM " "DSYRK " "DSYR2K"; /* Format strings */ @@ -179,10 +184,10 @@ static integer c__2 = 2; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -200,38 +205,38 @@ static integer c__2 = 2; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dchk1_(char *, doublereal *, doublereal *, + integer *, integer *, 
logical *, logical *, logical *, integer *, + integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, - doublereal *, doublereal *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, doublereal + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, ftnlen), dchk2_(char *, + doublereal *, doublereal *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), - dchk4_(char *, doublereal *, doublereal *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, ftnlen), dchk5_(char *, doublereal *, - doublereal *, integer *, integer *, logical *, logical *, logical - *, integer *, integer *, integer *, doublereal *, integer *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, ftnlen), dchk3_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, doublereal 
*, integer *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), + dchk4_(char *, doublereal *, doublereal *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, ftnlen), dchk5_(char *, doublereal *, + doublereal *, integer *, integer *, logical *, logical *, logical + *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), + doublereal *, doublereal *, doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *, doublereal *, ftnlen), dchke_(integer *, char *, integer *, ftnlen); logical fatal; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical trace; @@ -506,7 +511,7 @@ static integer c__2 = 2; goto L60; } for (i__ = 1; i__ <= 6; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -554,7 +559,7 @@ static integer c__2 = 2; } i__1 = n; for (j = 1; j <= i__1; ++j) { - cc[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 
+ cc[j - 1] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3); /* L110: */ } @@ -599,7 +604,7 @@ static integer c__2 = 2; } i__1 = n; for (j = 1; j <= i__1; ++j) { - cc[n - j] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - + cc[n - j] = (doublereal) (j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3); /* L130: */ } @@ -672,34 +677,34 @@ static integer c__2 = 2; /* Test DGEMM, 01. */ L140: dchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test DSYMM, 02. */ L150: dchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test DTRMM, 03, DTRSM, 04. */ L160: dchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test DSYRK, 05. */ L170: dchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test DSYR2K, 06. 
*/ L180: dchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -743,15 +748,20 @@ static integer c__2 = 2; /* End of DBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ -/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *a, - doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, - doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *a, + doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, + doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -775,7 +785,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -784,22 +794,22 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; extern logical lde_(doublereal *, doublereal *, integer *); doublereal als, bls, err, beta; integer ldas, ldbs, ldcs; logical same, 
null; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - logical *, integer *, logical *, ftnlen, ftnlen), dgemm_(char *, - char *, integer *, integer *, integer *, doublereal *, doublereal + logical *, integer *, logical *, ftnlen, ftnlen), dgemm_(char *, + char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); logical isame[13], trana, tranb; @@ -898,7 +908,7 @@ static integer c__2 = 2; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = *(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -926,9 +936,9 @@ static integer c__2 = 2; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1100,9 +1110,9 @@ static integer c__2 = 2; dmmch_(transa, transb, &m, &n, &k, &alpha, &a[a_offset], nmax, &b[b_offset], - nmax, 
&beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1183,12 +1193,12 @@ static integer c__2 = 2; } /* dchk1_ */ -/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *a, - doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, - doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *a, + doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, + doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -1213,7 +1223,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1222,7 +1232,7 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc; extern logical lde_(doublereal *, doublereal *, integer *); integer ics; @@ -1234,21 +1244,21 @@ static integer c__2 = 2; char side[1]; logical left, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ 
int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int dsymm_(char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dsymm_(char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); char uplos[1]; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -1391,7 +1401,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - dmake_("GE", " ", " ", &m, &n, &c__[c_offset], + dmake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1472,9 +1482,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1519,14 +1529,14 @@ static integer c__2 = 2; if (left) { dmmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { dmmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1606,11 +1616,11 @@ static integer c__2 = 2; } /* dchk2_ */ -/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nmax, doublereal *a, doublereal *aa, doublereal *as, - doublereal *b, doublereal *bb, doublereal *bs, doublereal *ct, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nmax, doublereal *a, doublereal *aa, doublereal *as, + doublereal *b, doublereal *bb, doublereal *bs, doublereal *ct, doublereal *g, doublereal *c__, ftnlen sname_len) { /* Initialized data */ @@ -1637,7 +1647,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1658,25 +1668,25 @@ static integer c__2 = 2; char side[1]; logical left, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer 
*, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; char diags[1]; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; char sides[1]; integer nargs; logical reset; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, - integer *, integer *, doublereal *, doublereal *, integer *, + extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), dtrsm_( char *, char *, char *, char *, integer *, integer *, doublereal * - , doublereal *, integer *, doublereal *, integer *, ftnlen, + , doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char uplos[1]; extern logical lderes_(char *, char *, integer *, integer *, doublereal *, @@ -1816,7 +1826,7 @@ static integer c__2 = 2; /* Generate the matrix B. 
*/ - dmake_("GE", " ", " ", &m, &n, &b[b_offset], + dmake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1882,7 +1892,7 @@ static integer c__2 = 2; } dtrmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1915,7 +1925,7 @@ static integer c__2 = 2; } dtrsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -1984,18 +1994,18 @@ static integer c__2 = 2; dmmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b86, &c__[c_offset], + c_b86, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { dmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b86, &c__[c_offset], + c_b86, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -2008,10 +2018,10 @@ static integer c__2 = 2; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { c__[i__ + j * c_dim1] = bb[i__ + (j - 1) * ldb]; - bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * + bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * b_dim1]; /* L60: */ } @@ -2024,16 +2034,16 @@ static integer c__2 = 2; &c__[c_offset], nmax, & c_b86, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { dmmch_("N", transa, &m, &n, &n, & - c_b96, &c__[c_offset], - nmax, &a[a_offset], nmax, - &c_b86, &b[b_offset], + c_b96, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b86, &b[b_offset], nmax, &ct[1], &g[1], &bb[ - 1], 
&ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_false, ( ftnlen)1, (ftnlen)1); } @@ -2114,12 +2124,12 @@ static integer c__2 = 2; } /* dchk3_ */ -/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *a, - doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, - doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *a, + doublereal *aa, doublereal *as, doublereal *b, doublereal *bb, + doublereal *bs, doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, ftnlen sname_len) { /* Initialized data */ @@ -2146,7 +2156,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -2166,13 +2176,13 @@ static integer c__2 = 2; doublereal bets; logical tran, null; char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, 
doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; @@ -2180,7 +2190,7 @@ static integer c__2 = 2; logical reset; char trans[1]; logical upper; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); char uplos[1]; @@ -2312,7 +2322,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], + dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2369,7 +2379,7 @@ static integer c__2 = 2; al__1.aunit = *ntra; f_rew(&al__1); } - dsyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, + dsyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, &beta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1) ; @@ -2385,9 +2395,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2440,19 +2450,19 @@ static integer c__2 = 2; } if (tran) { dmmch_("T", "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { dmmch_("N", "T", &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2544,12 +2554,12 @@ static integer c__2 = 2; } /* dchk4_ */ -/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int dchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, - integer *nbet, doublereal *bet, integer *nmax, doublereal *ab, - doublereal *aa, doublereal *as, doublereal *bb, doublereal *bs, - doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, + fatal, integer *nidim, integer *idim, integer *nalf, doublereal *alf, + integer *nbet, doublereal *bet, integer *nmax, doublereal *ab, + doublereal *aa, doublereal *as, doublereal *bb, doublereal *bs, + doublereal *c__, doublereal *cc, doublereal *cs, doublereal *ct, doublereal *g, doublereal *w, ftnlen sname_len) { /* Initialized data */ @@ -2597,13 +2607,13 @@ static integer c__2 = 2; doublereal bets; logical tran, null; 
char uplo[1]; - extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dmake_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, ftnlen, ftnlen, ftnlen); doublereal alpha; - extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dmmch_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical isame[13]; @@ -2612,8 +2622,8 @@ static integer c__2 = 2; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); extern logical lderes_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen); @@ -2762,7 +2772,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], + dmake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b86, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2843,9 +2853,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2902,7 +2912,7 @@ static integer c__2 = 2; if (tran) { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(j - 1 << 1) * *nmax + w[i__] = ab[(j - 1 << 1) * *nmax + k + i__]; w[k + i__] = ab[(j - 1 << 1) * * nmax + i__]; @@ -2913,17 +2923,17 @@ static integer c__2 = 2; i__8 = *nmax << 1; dmmch_("T", "N", &lj, &c__1, &i__6, & alpha, &ab[jjab], &i__7, &w[1] - , &i__8, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + , &i__8, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(k + i__ - 1) * *nmax + w[i__] = ab[(k + i__ - 1) * *nmax + j]; - w[k + i__] = ab[(i__ - 1) * *nmax + w[k + i__] = ab[(i__ - 1) * *nmax + j]; /* L60: */ } @@ -2931,9 +2941,9 @@ static integer c__2 = 2; i__7 = *nmax << 1; dmmch_("N", "N", &lj, &c__1, &i__6, & alpha, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3029,7 +3039,7 @@ static integer c__2 = 2; } /* dchk5_ */ -/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int dchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3042,24 +3052,24 @@ static integer c__2 = 2; integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - doublereal a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] + doublereal a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was 
[2][1] */, beta, alpha; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen), - dtrmm_(char *, char *, char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + dtrmm_(char *, char *, char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), dsymm_(char *, char *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, ftnlen, ftnlen), - dtrsm_(char *, char *, char *, char *, integer *, integer *, - doublereal *, doublereal *, integer *, doublereal *, integer *, + dtrsm_(char *, char *, char *, char *, integer *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), dsyrk_(char *, char *, integer *, - integer *, doublereal *, doublereal *, integer *, doublereal *, - doublereal *, integer *, ftnlen, ftnlen), dsyr2k_(char *, char *, - integer *, integer *, doublereal *, doublereal *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *, - ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + doublereal *, integer *, ftnlen, ftnlen), dsyr2k_(char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3113,142 +3123,142 @@ static integer c__2 = 2; } L10: 
infoc_1.infot = 1; - dgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - dgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - dgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - dgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - dgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - dgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - dgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; 
- dgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - dgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("T", "N", 
&c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - dgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - dgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("T", "N", &c__0, &c__0, 
&c__2, &alpha, a, &c__2, b, &c__1, &beta, + dgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - dgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + dgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + dgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - dgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + dgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -3952,9 +3962,9 @@ static integer c__2 = 2; } /* dchke_ */ 
-/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int dmake_(char *type__, char *uplo, char *diag, integer *m, integer *n, doublereal *a, integer *nmax, doublereal *aa, integer * - lda, logical *reset, doublereal *transl, ftnlen type_len, ftnlen + lda, logical *reset, doublereal *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4097,8 +4107,8 @@ static integer c__2 = 2; } /* dmake_ */ /* Subroutine */ int dmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, doublereal *alpha, doublereal *a, integer *lda, - doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, + n, integer *kk, doublereal *alpha, doublereal *a, integer *lda, + doublereal *b, integer *ldb, doublereal *beta, doublereal *c__, integer *ldc, doublereal *ct, doublereal *g, doublereal *cc, integer * ldcc, doublereal *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) @@ -4112,7 +4122,7 @@ static integer c__2 = 2; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3; doublereal d__1, d__2; @@ -4166,9 +4176,9 @@ static integer c__2 = 2; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; - tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; /* Compute expected result, one column at a time, in CT using data */ @@ -4190,7 +4200,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[k + j * b_dim1]; - g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[i__ + k * 
a_dim1], abs(d__1)) * (d__2 = b[k + j * b_dim1], abs(d__2)); /* L20: */ } @@ -4202,7 +4212,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[k + j * b_dim1]; - g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 = b[k + j * b_dim1], abs(d__2)); /* L40: */ } @@ -4214,7 +4224,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[j + k * b_dim1]; - g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[i__ + k * a_dim1], abs(d__1)) * (d__2 = b[j + k * b_dim1], abs(d__2)); /* L60: */ } @@ -4226,7 +4236,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[j + k * b_dim1]; - g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 + g[i__] += (d__1 = a[k + i__ * a_dim1], abs(d__1)) * (d__2 = b[j + k * b_dim1], abs(d__2)); /* L80: */ } @@ -4520,7 +4530,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/sblat1.c b/blastest/src/sblat1.c index 6996666a5..7bde1b108 100644 --- a/blastest/src/sblat1.c +++ b/blastest/src/sblat1.c @@ -69,6 +69,11 @@ static real c_b63 = 0.f; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "sblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static real sfac = 9.765625e-4f; @@ -123,11 +128,11 @@ static real c_b63 = 0.f; combla_1.incy = 9999; if (combla_1.icase == 3 || combla_1.icase == 11) { check0_(&sfac); - } else if (combla_1.icase == 7 || combla_1.icase == 8 || + } else if 
(combla_1.icase == 7 || combla_1.icase == 8 || combla_1.icase == 9 || combla_1.icase == 10) { check1_(&sfac); - } else if (combla_1.icase == 1 || combla_1.icase == 2 || - combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase + } else if (combla_1.icase == 1 || combla_1.icase == 2 || + combla_1.icase == 5 || combla_1.icase == 6 || combla_1.icase == 12 || combla_1.icase == 13) { check2_(&sfac); } else if (combla_1.icase == 4) { @@ -142,7 +147,12 @@ static real c_b63 = 0.f; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -202,16 +212,16 @@ static real c_b63 = 0.f; static real dc1[8] = { .6f,.8f,-.6f,.8f,.6f,1.f,0.f,1.f }; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); /* Local variables */ integer i__, k; real sa, sb, sc, ss, dtemp[9]; - extern /* Subroutine */ int srotg_(real *, real *, real *, real *), + extern /* Subroutine */ int srotg_(real *, real *, real *, real *), stest_(integer *, real *, real *, real *, real *), stest1_(real *, - real *, real *, real *), srotmg_(real *, real *, real *, real *, + real *, real *, real *), srotmg_(real *, real *, real *, real *, real *); /* Fortran I/O blocks */ @@ -322,7 +332,7 @@ static real c_b63 = 0.f; real r__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -335,8 +345,8 @@ static real c_b63 = 0.f; real stemp[1]; extern real sasum_(integer *, real *, integer *); real strue[8]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, - real *), itest1_(integer *, integer *), stest1_(real 
*, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + real *), itest1_(integer *, integer *), stest1_(real *, real *, real *, real *); extern integer isamax_(integer *, real *, integer *); @@ -378,11 +388,11 @@ static real c_b63 = 0.f; stest1_(&r__1, stemp, stemp, sfac); } else if (combla_1.icase == 9) { /* .. SSCAL .. */ - sscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], + sscal_(&combla_1.n, &sa[(combla_1.incx - 1) * 5 + np1 - 1], sx, &combla_1.incx); i__1 = len; for (i__ = 1; i__ <= i__1; ++i__) { - strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << + strue[i__ - 1] = dtrue5[i__ + (np1 + combla_1.incx * 5 << 3) - 49]; /* L40: */ } @@ -455,87 +465,87 @@ static real c_b63 = 0.f; ; static struct { real e_1[448]; - } equiv_3 = {{ .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, 0.f, + } equiv_3 = {{ .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 3.8f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, - 2.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, -.4f, 0.f, 0.f, 0.f, - 0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, 3.8f, - -2.2f, -1.2f, 0.f, 0.f, 0.f, -.9f, 2.8f, -1.4f, -1.3f, 0.f, - 0.f, 0.f, 3.5f, -.4f, -2.2f, 4.7f, 0.f, 0.f, 0.f, .6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, - 0.f, 0.f, 0.f, 0.f, 
0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, + 2.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, -.4f, 0.f, 0.f, 0.f, + 0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, 3.8f, + -2.2f, -1.2f, 0.f, 0.f, 0.f, -.9f, 2.8f, -1.4f, -1.3f, 0.f, + 0.f, 0.f, 3.5f, -.4f, -2.2f, 4.7f, 0.f, 0.f, 0.f, .6f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, 0.f, 0.f, .1f, -3.f, 0.f, 0.f, 0.f, 0.f, -.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, - -.5f, .8f, .9f, -.3f, -.4f, -2.f, .1f, 1.4f, .8f, .6f, -.3f, - -2.8f, -1.8f, .1f, 1.3f, .8f, 0.f, -.3f, -1.9f, 3.8f, .1f, - -3.1f, .8f, 4.8f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, + -.5f, .8f, .9f, -.3f, -.4f, -2.f, .1f, 1.4f, .8f, .6f, -.3f, + -2.8f, -1.8f, .1f, 1.3f, .8f, 0.f, -.3f, -1.9f, 3.8f, .1f, + -3.1f, .8f, 4.8f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, 0.f, 0.f, 0.f, 0.f, 4.8f, .1f, -3.f, - 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 3.3f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, 2.1f, .1f, -2.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, .8f, .9f, -.3f, -.4f, -1.6f, .1f, -2.2f, .8f, 5.4f, -.3f, -2.8f, -1.5f, - .1f, -1.4f, .8f, 3.6f, -.3f, -1.9f, 3.7f, .1f, -2.2f, .8f, - 3.6f, -.3f, -1.5f, .6f, 0.f, 
0.f, 0.f, 0.f, 0.f, 0.f, .6f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - .6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, -1.f, 0.f, 0.f, 0.f, + .1f, -1.4f, .8f, 3.6f, -.3f, -1.9f, 3.7f, .1f, -2.2f, .8f, + 3.6f, -.3f, -1.5f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + .6f, .1f, 0.f, 0.f, 0.f, 0.f, 0.f, -.8f, -1.f, 0.f, 0.f, 0.f, 0.f, 0.f, -.9f, -.8f, 0.f, 0.f, 0.f, 0.f, 0.f, 3.5f, .8f, 0.f, 0.f, 0.f, 0.f, 0.f, .6f, .1f, -.5f, .8f, 0.f, 0.f, 0.f, -.8f, - -1.f, 1.4f, -1.6f, 0.f, 0.f, 0.f, -.9f, -.8f, 1.3f, -1.6f, + -1.f, 1.4f, -1.6f, 0.f, 0.f, 0.f, -.9f, -.8f, 1.3f, -1.6f, 0.f, 0.f, 0.f, 3.5f, .8f, -3.1f, 4.8f, 0.f, 0.f, 0.f }}; static struct { real e_1[448]; - } equiv_7 = {{ .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, + } equiv_7 = {{ .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, 0.f, - 0.f, 0.f, 0.f, 0.f, .7f, -4.8f, 0.f, 0.f, 0.f, 0.f, 0.f, - 1.7f, -.7f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 3.5f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, .7f, -4.8f, 0.f, 0.f, 0.f, 0.f, 0.f, + 1.7f, -.7f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 3.5f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, 0.f, 0.f, 0.f, .7f, 
-4.8f, - 3.f, 1.1f, 0.f, 0.f, 0.f, 1.7f, -.7f, -.7f, 2.3f, 0.f, 0.f, - 0.f, -2.6f, 3.5f, -.7f, -3.6f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, + 3.f, 1.1f, 0.f, 0.f, 0.f, 1.7f, -.7f, -.7f, 2.3f, 0.f, 0.f, + 0.f, -2.6f, 3.5f, -.7f, -3.6f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, - 4.f, -.9f, -.3f, 0.f, 0.f, 0.f, 0.f, -.5f, -.9f, 1.5f, 0.f, - 0.f, 0.f, 0.f, -1.5f, -.9f, -1.8f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, + 4.f, -.9f, -.3f, 0.f, 0.f, 0.f, 0.f, -.5f, -.9f, 1.5f, 0.f, + 0.f, 0.f, 0.f, -1.5f, -.9f, -1.8f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f, .8f, 3.7f, -.9f, -1.2f, .7f, -1.5f, - .2f, 2.2f, -.3f, -.9f, 2.1f, .7f, -1.6f, .2f, 2.f, -1.6f, - -.9f, -2.1f, .7f, 2.9f, .2f, -3.8f, .5f, 0.f, 0.f, 0.f, 0.f, - 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, - 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, + .2f, 2.2f, -.3f, -.9f, 2.1f, .7f, -1.6f, .2f, 2.f, -1.6f, + -.9f, -2.1f, .7f, 2.9f, .2f, -3.8f, .5f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .5f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 4.f, + 0.f, 0.f, 0.f, .5f, -.9f, 0.f, 0.f, 0.f, 0.f, 0.f, 4.f, -6.3f, 0.f, 0.f, 0.f, 0.f, 0.f, -.5f, .3f, 0.f, 0.f, 0.f, 0.f, - 0.f, 
-1.5f, 3.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, - .7f, 0.f, 0.f, 0.f, 3.7f, -7.2f, 3.f, 1.7f, 0.f, 0.f, 0.f, - -.3f, .9f, -.7f, 1.9f, 0.f, 0.f, 0.f, -1.6f, 2.7f, -.7f, - -3.4f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, - 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, - 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, - 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, -1.5f, 3.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, + .7f, 0.f, 0.f, 0.f, 3.7f, -7.2f, 3.f, 1.7f, 0.f, 0.f, 0.f, + -.3f, .9f, -.7f, 1.9f, 0.f, 0.f, 0.f, -1.6f, 2.7f, -.7f, + -3.4f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, + 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, .5f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, .7f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.7f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, -2.6f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, 0.f, 0.f, 0.f, 0.f, .7f, -.9f, 1.2f, 0.f, 0.f, - 0.f, 0.f, 1.7f, -.9f, .5f, 0.f, 0.f, 0.f, 0.f, -2.6f, -.9f, - -1.3f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f, - .8f, .7f, -.9f, 1.2f, .7f, -1.5f, .2f, 1.6f, 1.7f, -.9f, .5f, - .7f, -1.6f, .2f, 2.4f, -2.6f, -.9f, -1.3f, .7f, 2.9f, .2f, + 0.f, 0.f, 1.7f, -.9f, .5f, 0.f, 0.f, 0.f, 0.f, -2.6f, -.9f, + -1.3f, 0.f, 0.f, 0.f, 0.f, .5f, -.9f, .3f, .7f, -.6f, .2f, + .8f, .7f, -.9f, 1.2f, .7f, -1.5f, .2f, 1.6f, 1.7f, -.9f, .5f, + .7f, -1.6f, .2f, 2.4f, -2.6f, -.9f, -1.3f, .7f, 2.9f, .2f, -4.f }}; @@ -544,7 +554,7 @@ static real c_b63 = 0.f; real r__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -568,13 +578,13 @@ static real c_b63 = 0.f; #define dt19yd ((real *)&equiv_7 + 336) integer ksize; real ssize[7]; - extern /* Subroutine */ int scopy_(integer 
*, real *, integer *, real *, + extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), stest_(integer *, real *, real *, real *, real *), saxpy_( integer *, real *, real *, integer *, real *, integer *), srotm_( integer *, real *, integer *, real *, integer *, real *), stest1_( real *, real *, real *, real *); - extern real sdsdot_(integer *, real *, real *, integer *, real *, integer + extern real sdsdot_(integer *, real *, real *, integer *, real *, integer *); /* Fortran I/O blocks */ @@ -627,7 +637,7 @@ static real c_b63 = 0.f; /* .. SDOT .. */ r__1 = sdot_(&combla_1.n, sx, &combla_1.incx, sy, & combla_1.incy); - stest1_(&r__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], + stest1_(&r__1, &dt7[kn + (ki << 2) - 5], &ssize1[kn - 1], sfac); } else if (combla_1.icase == 2) { /* .. SAXPY .. */ @@ -664,9 +674,9 @@ static real c_b63 = 0.f; for (i__ = 1; i__ <= 7; ++i__) { sx[i__ - 1] = dx1[i__ - 1]; sy[i__ - 1] = dy1[i__ - 1]; - stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - + stx[i__ - 1] = dt19x[i__ + (kpar + (kni << 2)) * 7 - 36]; - sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - + sty[i__ - 1] = dt19y[i__ + (kpar + (kni << 2)) * 7 - 36]; } @@ -696,7 +706,7 @@ static real c_b63 = 0.f; /* .. SDSROT .. 
*/ r__1 = sdsdot_(&combla_1.n, &c_b39, sx, &combla_1.incx, sy, & combla_1.incy); - stest1_(&r__1, &st7b[kn + (ki << 2) - 5], &ssize3[kn - 1], + stest1_(&r__1, &st7b[kn + (ki << 2) - 5], &ssize3[kn - 1], sfac); } else { s_wsle(&io___80); @@ -759,7 +769,7 @@ static real c_b63 = 0.f; 1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f,1.17f }; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -770,12 +780,12 @@ static real c_b63 = 0.f; real mwpc[11]; integer mwpn[11]; real mwps[11]; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, integer *, real *, real *); real mwpx[5], mwpy[5]; integer ksize; real copyx[5], copyy[5]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); real mwptx[55] /* was [11][5] */, mwpty[55] /* was [11][5] */; integer mwpinx[11], mwpiny[11]; @@ -1032,7 +1042,7 @@ static real c_b63 = 0.f; sfac) { real scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, real *, real *, real *, + extern /* Subroutine */ int stest_(integer *, real *, real *, real *, real *); /* ************************* STEST1 ***************************** */ diff --git a/blastest/src/sblat2.c b/blastest/src/sblat2.c index 54d0a010a..a2ce310f6 100644 --- a/blastest/src/sblat2.c +++ b/blastest/src/sblat2.c @@ -155,10 +155,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "sblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*16] = "SGEMV " "SGBMV " "SSYMV " "SSBMV " "SSPMV " - "STRMV " "STBMV " "STPMV 
" "STRSV " "STBSV " "STPSV " "SGER " + static char snames[6*16] = "SGEMV " "SGBMV " "SSYMV " "SSBMV " "SSPMV " + "STRMV " "STBMV " "STPMV " "STRSV " "STBSV " "STPSV " "SGER " "SSYR " "SSPR " "SSYR2 " "SSPR2 "; /* Format strings */ @@ -204,10 +209,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -228,40 +233,40 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, real *, integer *, real *, + extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, real *, integer *, real *, integer *, integer *, integer *, integer *, real *, real *, real * - , real *, real *, real *, real *, real *, real *, real *, real *, - ftnlen), schk2_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, - integer *, integer *, real *, integer *, real *, integer *, - integer *, integer *, integer *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, ftnlen), - schk3_(char *, real *, real *, integer *, integer *, logical *, - logical *, logical *, integer *, integer *, integer *, 
integer *, + , real *, real *, real *, real *, real *, real *, real *, real *, + ftnlen), schk2_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, + integer *, integer *, real *, integer *, real *, integer *, + integer *, integer *, integer *, real *, real *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, ftnlen), + schk3_(char *, real *, real *, integer *, integer *, logical *, + logical *, logical *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, real *, real *, real * , real *, real *, real *, real *, real *, real *, ftnlen), schk4_( char *, real *, real *, integer *, integer *, logical *, logical * - , logical *, integer *, integer *, integer *, real *, integer *, - integer *, integer *, integer *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, real *, - ftnlen), schk5_(char *, real *, real *, integer *, integer *, - logical *, logical *, logical *, integer *, integer *, integer *, + , logical *, integer *, integer *, integer *, real *, integer *, + integer *, integer *, integer *, real *, real *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, real *, + ftnlen), schk5_(char *, real *, real *, integer *, integer *, + logical *, logical *, logical *, integer *, integer *, integer *, real *, integer *, integer *, integer *, integer *, real *, real * - , real *, real *, real *, real *, real *, real *, real *, real *, + , real *, real *, real *, real *, real *, real *, real *, real *, real *, real *, ftnlen), schk6_(char *, real *, real *, integer *, integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, integer *, integer *, integer *, - real *, real *, real *, real *, real *, real *, real *, real *, + integer *, real *, integer *, integer *, integer *, integer *, + real *, real *, real *, real *, real *, real *, real *, 
real *, real *, real *, real *, real *, ftnlen); logical fatal; extern /* Subroutine */ int schke_(integer *, char *, integer *, ftnlen); logical trace; integer nidim; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); char snaps[32], trans[1]; integer isnum; @@ -610,7 +615,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 16; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -737,44 +742,44 @@ static logical c_false = FALSE_; /* Test SGEMV, 01, and SGBMV, 02. */ L140: schk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. */ L150: schk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test STRMV, 06, STBMV, 07, STPMV, 08, */ /* STRSV, 09, STBSV, 10, and STPSV, 11. 
*/ L160: schk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test SGER, 12. */ L170: schk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test SSYR, 13, and SSPR, 14. */ L180: schk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test SSYR2, 15, and SSPR2, 16. */ L190: schk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -816,15 +821,20 @@ static logical c_false = FALSE_; /* End of SBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int schk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * nalf, real *alf, integer *nbet, real *bet, integer *ninc, integer * - inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, - real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, + inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, + real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -872,24 +882,24 @@ static logical c_false = FALSE_; logical full, tran, null; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; extern /* Subroutine */ int sgbmv_(char *, integer *, integer *, integer * , integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, ftnlen), smvch_(char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, - integer *, real *, real *, real *, real *, real *, logical *, + real *, integer *, ftnlen), smvch_(char *, integer *, integer *, + real *, real *, integer *, real *, integer *, real *, real *, + integer *, real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen), sgemv_(char *, integer *, integer * - , real *, real *, integer *, real *, integer *, real *, real *, + , real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen); logical reset; integer incxs, incys; char trans[1]; logical banded; real errmax; - 
extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; char transs[1]; @@ -1066,9 +1076,9 @@ static logical c_false = FALSE_; transl = 0.f; i__7 = abs(incy); i__8 = ml - 1; - smake_("GE", " ", " ", &c__1, &ml, &y[1], + smake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1076,7 +1086,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. */ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1134,7 +1144,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - sgemv_(trans, &m, &n, &alpha, &aa[1], + sgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1259,8 +1269,8 @@ static logical c_false = FALSE_; smvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1365,11 +1375,11 @@ static logical c_false = FALSE_; } /* schk1_ */ /* Subroutine */ int schk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * nalf, real *alf, integer *nbet, real *bet, integer *ninc, integer * - inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, - real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, + inc, integer *nmax, integer *incmax, real *a, real *aa, real *as, + real *x, real *xx, real *xs, real *y, real *yy, real *ys, 
real *yt, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1407,7 +1417,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; real als, bls; extern logical lse_(real *, real *, integer *); @@ -1419,27 +1429,27 @@ static logical c_false = FALSE_; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs, incys; - extern /* Subroutine */ int ssbmv_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, + extern /* Subroutine */ int ssbmv_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, ftnlen); char uplos[1]; - extern /* Subroutine */ int sspmv_(char *, integer *, real *, real *, + extern /* Subroutine */ int sspmv_(char *, integer *, real *, real *, real *, integer *, real *, real *, integer *, ftnlen), ssymv_( - char *, integer *, real *, real *, integer *, real *, integer *, + char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen); logical banded, packed; real 
errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -1599,7 +1609,7 @@ static logical c_false = FALSE_; i__8 = n - 1; smake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, &yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1816,8 +1826,8 @@ static logical c_false = FALSE_; /* Check the result. */ - smvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + smvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -1928,10 +1938,10 @@ static logical c_false = FALSE_; } /* schk2_ */ /* Subroutine */ int schk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *xt, real *g, real *z__, + real *as, real *x, real *xx, real *xs, real *xt, real *g, real *z__, ftnlen sname_len) { /* Initialized data */ @@ -1971,7 +1981,7 @@ static logical c_false = FALSE_; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, + integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, icu; extern logical lse_(real *, real *, integer *); real err; @@ -1982,32 +1992,32 @@ static logical c_false = FALSE_; logical full, null; char uplo[1], diags[1]; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine 
*/ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs; char trans[1]; - extern /* Subroutine */ int stbmv_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, - ftnlen), stbsv_(char *, char *, char *, integer *, integer *, + extern /* Subroutine */ int stbmv_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, + ftnlen), stbsv_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen); char uplos[1]; - extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), strmv_(char *, - char *, char *, integer *, real *, integer *, real *, integer *, + char *, char *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, char *, integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), strsv_(char * , char *, char *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; char transs[1]; @@ -2133,13 +2143,13 @@ static logical 
c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + *(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl = 0.f; - smake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + smake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2186,7 +2196,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2239,7 +2249,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - stbmv_(uplo, trans, diag, &n, &k, &aa[1], + stbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2320,7 +2330,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - stbsv_(uplo, trans, diag, &n, &k, &aa[1], + stbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2362,11 +2372,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2437,7 +2447,7 @@ static logical c_false = FALSE_; smvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &x[1], &incx, & c_b120, &z__[1], &incx, &xt[1], & - g[1], &xx[1], eps, &err, fatal, + g[1], &xx[1], eps, &err, fatal, nout, &c_true, (ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( ftnlen)2) == 0) { @@ -2446,7 +2456,7 @@ static logical c_false = FALSE_; i__4 = n; for (i__ = 1; i__ <= i__4; ++i__) { - z__[i__] = xx[(i__ - 1) * abs(incx) + + z__[i__] = xx[(i__ - 1) * abs(incx) + 1]; xx[(i__ - 1) * abs(incx) + 1] = x[i__] ; @@ -2455,7 +2465,7 @@ static logical c_false = FALSE_; smvch_(trans, &n, &n, &c_b128, &a[ a_offset], nmax, &z__[1], &incx, & c_b120, &x[1], &incx, &xt[1], &g[ - 1], &xx[1], eps, &err, fatal, + 1], &xx[1], eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2558,10 +2568,10 @@ static logical c_false = FALSE_; } /* schk3_ */ /* Subroutine */ int schk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, + real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, real *z__, ftnlen sname_len) { /* Format strings */ @@ -2599,24 +2609,24 @@ static logical c_false = FALSE_; real err; integer ldas; logical same; - extern /* Subroutine */ int sger_(integer *, integer *, real *, 
real *, + extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer incx, incy; logical null; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs, incys; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -2718,7 +2728,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; smake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 1) { x[m / 2] = 0.f; @@ -2752,7 +2762,7 @@ static logical c_false = FALSE_; transl = 0.f; i__5 = m - 1; i__6 = n - 1; - smake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + smake_(sname + 1, " ", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2883,9 +2893,9 @@ static logical c_false = FALSE_; } else { w[0] = y[n - j + 1]; } - smvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + smvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, 
&c_b128, &a[j * a_dim1 + 1], - &c__1, &yt[1], &g[1], &aa[(j - 1) * + &c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -2966,10 +2976,10 @@ static logical c_false = FALSE_; } /* schk4_ */ /* Subroutine */ int schk5_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, + real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, real *z__, ftnlen sname_len) { /* Initialized data */ @@ -3017,18 +3027,18 @@ static logical c_false = FALSE_; integer incx; logical full, null; char uplo[1]; - extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, - integer *, real *, ftnlen), ssyr_(char *, integer *, real *, real + extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, + integer *, real *, ftnlen), ssyr_(char *, integer *, real *, real *, integer *, real *, integer *, ftnlen); real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real 
*, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs; @@ -3036,7 +3046,7 @@ static logical c_false = FALSE_; char uplos[1]; logical packed; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -3140,7 +3150,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; smake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.f; @@ -3309,9 +3319,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - smvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + smvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, &c__1, &c_b128, &a[jj + j * a_dim1], & - c__1, &yt[1], &g[1], &aa[ja], eps, &err, + c__1, &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3410,10 +3420,10 @@ static logical c_false = FALSE_; } /* schk5_ */ /* Subroutine */ int schk6_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * ninc, integer *inc, integer *nmax, integer *incmax, real *a, real *aa, - real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, + real *as, real *x, real *xx, real *xs, real *y, real *yy, real *ys, real *yt, real *g, real *z__, ftnlen sname_len) { /* Initialized data */ @@ -3442,7 +3452,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -3462,19 +3472,19 @@ static logical 
c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *, - integer *, real *, integer *, real *, ftnlen), ssyr2_(char *, - integer *, real *, real *, integer *, real *, integer *, real *, + extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *, + integer *, real *, integer *, real *, ftnlen), ssyr2_(char *, + integer *, real *, real *, integer *, real *, integer *, real *, integer *, ftnlen); real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *, integer *, logical *, real *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - real *, real *, real *, real *, real *, logical *, integer *, + extern /* Subroutine */ int smvch_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + real *, real *, real *, real *, real *, logical *, integer *, logical *, ftnlen); logical reset; integer incxs, incys; @@ -3482,7 +3492,7 @@ static logical c_false = FALSE_; char uplos[1]; logical packed; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); real transl; @@ -3588,7 +3598,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; smake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { x[n / 2] = 0.f; @@ -3623,7 +3633,7 @@ static logical c_false = FALSE_; transl = 
0.f; i__5 = n - 1; i__6 = n - 1; - smake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + smake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3801,7 +3811,7 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - smvch_("N", &lj, &c__2, &alpha, &z__[jj + + smvch_("N", &lj, &c__2, &alpha, &z__[jj + z_dim1], nmax, w, &c__1, &c_b128, &a[ jj + j * a_dim1], &c__1, &yt[1], &g[1] , &aa[ja], eps, &err, fatal, nout, & @@ -3907,7 +3917,7 @@ static logical c_false = FALSE_; } /* schk6_ */ -/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3921,35 +3931,35 @@ static logical c_false = FALSE_; /* Local variables */ real a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, - integer *, real *, integer *, real *, integer *), sspr_(char *, - integer *, real *, real *, integer *, real *, ftnlen), ssyr_(char - *, integer *, real *, real *, integer *, real *, integer *, - ftnlen), sspr2_(char *, integer *, real *, real *, integer *, - real *, integer *, real *, ftnlen), ssyr2_(char *, integer *, - real *, real *, integer *, real *, integer *, real *, integer *, + extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + integer *, real *, integer *, real *, integer *), sspr_(char *, + integer *, real *, real *, integer *, real *, ftnlen), ssyr_(char + *, integer *, real *, real *, integer *, real *, integer *, + ftnlen), sspr2_(char *, integer *, real *, real *, integer *, + real *, integer *, real *, ftnlen), ssyr2_(char *, integer *, + real *, real *, integer *, real *, integer *, real *, integer *, ftnlen); real alpha; extern /* Subroutine */ int sgbmv_(char *, integer *, integer *, integer * , integer *, real *, real *, integer *, real *, integer *, real *, - real *, 
integer *, ftnlen), sgemv_(char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, - integer *, ftnlen), ssbmv_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *, - ftnlen), stbmv_(char *, char *, char *, integer *, integer *, - real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), - stbsv_(char *, char *, char *, integer *, integer *, real *, + real *, integer *, ftnlen), sgemv_(char *, integer *, integer *, + real *, real *, integer *, real *, integer *, real *, real *, + integer *, ftnlen), ssbmv_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *, + ftnlen), stbmv_(char *, char *, char *, integer *, integer *, + real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), + stbsv_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen), sspmv_( - char *, integer *, real *, real *, real *, integer *, real *, - real *, integer *, ftnlen), stpmv_(char *, char *, char *, - integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), - strmv_(char *, char *, char *, integer *, real *, integer *, real - *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, - char *, integer *, real *, real *, integer *, ftnlen, ftnlen, - ftnlen), ssymv_(char *, integer *, real *, real *, integer *, + char *, integer *, real *, real *, real *, integer *, real *, + real *, integer *, ftnlen), stpmv_(char *, char *, char *, + integer *, real *, real *, integer *, ftnlen, ftnlen, ftnlen), + strmv_(char *, char *, char *, integer *, real *, integer *, real + *, integer *, ftnlen, ftnlen, ftnlen), stpsv_(char *, char *, + char *, integer *, real *, real *, integer *, ftnlen, ftnlen, + ftnlen), ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen), strsv_( - char *, char *, char *, integer *, real *, integer *, real *, - 
integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *, + char *, char *, char *, integer *, real *, integer *, real *, + integer *, ftnlen, ftnlen, ftnlen), chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4455,9 +4465,9 @@ static logical c_false = FALSE_; } /* schke_ */ -/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, integer *n, real *a, integer *nmax, real *aa, integer *lda, integer * - kl, integer *ku, logical *reset, real *transl, ftnlen type_len, + kl, integer *ku, logical *reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4516,7 +4526,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { a[i__ + j * a_dim1] = sbeg_(reset) + *transl; } else { @@ -4690,9 +4700,9 @@ static logical c_false = FALSE_; } /* smake_ */ -/* Subroutine */ int smvch_(char *trans, integer *m, integer *n, real *alpha, - real *a, integer *nmax, real *x, integer *incx, real *beta, real *y, - integer *incy, real *yt, real *g, real *yy, real *eps, real *err, +/* Subroutine */ int smvch_(char *trans, integer *m, integer *n, real *alpha, + real *a, integer *nmax, real *x, integer *incx, real *beta, real *y, + integer *incy, real *yt, real *g, real *yy, real *eps, real *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -4807,7 +4817,7 @@ static logical c_false = FALSE_; *err = 0.f; i__1 = ml; for (i__ = 1; i__ <= i__1; ++i__) { - erri = (r__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(r__1)) / + erri = (r__1 = yt[i__] - yy[(i__ - 1) * abs(*incy) + 1], abs(r__1)) / *eps; if (g[i__] != 0.f) { erri /= g[i__]; @@ -4903,7 
+4913,7 @@ logical lse_(real *ri, real *rj, integer *lr) } /* lse_ */ -logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, +logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, real *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -5064,7 +5074,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/sblat3.c b/blastest/src/sblat3.c index dc5ef5738..01d4ca4b8 100644 --- a/blastest/src/sblat3.c +++ b/blastest/src/sblat3.c @@ -135,9 +135,14 @@ static integer c__2 = 2; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "sblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*6] = "SGEMM " "SSYMM " "STRMM " "STRSM " "SSYRK " + static char snames[6*6] = "SGEMM " "SSYMM " "STRMM " "STRSM " "SSYRK " "SSYR2K"; /* Format strings */ @@ -179,10 +184,10 @@ static integer c__2 = 2; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -200,33 +205,33 @@ static integer c__2 = 2; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ 
int schk1_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, real *, integer *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, real *, - real *, ftnlen), schk2_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, real *, integer *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, real *, - real *, ftnlen), schk3_(char *, real *, real *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, real *, integer *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, ftnlen), schk4_(char *, - real *, real *, integer *, integer *, logical *, logical *, - logical *, integer *, integer *, integer *, real *, integer *, + extern /* Subroutine */ int schk1_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, real *, + real *, ftnlen), schk2_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, real *, + real *, real *, real *, real *, real *, real *, real *, real *, + real *, ftnlen), schk3_(char *, real *, real *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, real *, integer *, real *, real *, real *, real *, + real *, real *, real *, real *, real *, ftnlen), schk4_(char *, + real *, real *, integer *, integer *, logical *, logical *, + logical *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, ftnlen), schk5_(char *, - real *, real *, integer *, integer *, logical *, 
logical *, - logical *, integer *, integer *, integer *, real *, integer *, + real *, real *, real *, real *, real *, ftnlen), schk5_(char *, + real *, real *, integer *, integer *, logical *, logical *, + logical *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, real *, real *, real *, real *, real *, ftnlen); logical fatal; extern /* Subroutine */ int schke_(integer *, char *, integer *, ftnlen); logical trace; integer nidim; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); char snaps[32]; integer isnum; @@ -496,7 +501,7 @@ static integer c__2 = 2; goto L60; } for (i__ = 1; i__ <= 6; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -662,34 +667,34 @@ static integer c__2 = 2; /* Test SGEMM, 01. */ L140: schk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test SSYMM, 02. 
*/ L150: schk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test STRMM, 03, STRSM, 04. */ L160: schk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test SSYRK, 05. */ L170: schk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test SSYR2K, 06. */ L180: schk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -733,14 +738,19 @@ static integer c__2 = 2; /* End of SBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ /* Subroutine */ int schk1_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * - nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, - real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, + nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, + real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -764,7 +774,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; alist al__1; @@ -773,7 +783,7 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; real als, bls; extern logical lse_(real *, real *, integer *); @@ -782,22 +792,22 @@ static integer c__2 = 2; logical same, null; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * , ftnlen, ftnlen, ftnlen); logical trana, tranb; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, 
real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen), sgemm_( - char *, char *, integer *, integer *, integer *, real *, real *, - integer *, real *, integer *, real *, real *, integer *, ftnlen, + char *, char *, integer *, integer *, integer *, real *, real *, + integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); integer nargs; logical reset; char tranas[1], tranbs[1], transa[1], transb[1]; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -888,7 +898,7 @@ static integer c__2 = 2; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = *(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -916,9 +926,9 @@ static integer c__2 = 2; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1090,9 +1100,9 @@ static integer c__2 = 2; smmch_(transa, transb, &m, &n, &k, &alpha, &a[a_offset], nmax, &b[b_offset], - nmax, &beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1174,10 +1184,10 @@ static integer c__2 = 2; } /* schk1_ */ /* Subroutine */ int schk2_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, 
integer *idim, integer *nalf, real *alf, integer * - nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, - real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, + nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, + real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -1202,7 +1212,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1211,7 +1221,7 @@ static integer c__2 = 2; f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc, ics; real als, bls; integer icu; @@ -1224,22 +1234,22 @@ static integer c__2 = 2; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * , ftnlen, ftnlen, ftnlen); char sides[1]; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char uplos[1]; - extern /* Subroutine */ int ssymm_(char *, char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, + extern /* Subroutine */ int ssymm_(char *, char *, integer *, 
integer *, + real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1378,7 +1388,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - smake_("GE", " ", " ", &m, &n, &c__[c_offset], + smake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1459,9 +1469,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1506,14 +1516,14 @@ static integer c__2 = 2; if (left) { smmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { smmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1594,7 +1604,7 @@ static integer c__2 = 2; } /* schk2_ */ /* Subroutine */ int schk3_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * nmax, real *a, real *aa, real *as, real *b, real *bb, real *bs, real * ct, real *g, real *c__, ftnlen sname_len) @@ -1623,7 +1633,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer 
a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -1647,25 +1657,25 @@ static integer c__2 = 2; real alpha; char diags[1]; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * , ftnlen, ftnlen, ftnlen); char sides[1]; - extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, - real *, integer *, real *, real *, real *, integer *, real *, + extern /* Subroutine */ int smmch_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char uplos[1]; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ int strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - , ftnlen, ftnlen, ftnlen, ftnlen), strsm_(char *, char *, char *, - char *, integer *, integer *, real *, real *, integer *, real *, + , ftnlen, ftnlen, ftnlen, ftnlen), strsm_(char *, char *, char *, + char *, integer *, integer *, real *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char tranas[1], transa[1]; real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1800,7 +1810,7 @@ static integer c__2 = 2; /* Generate the matrix B. 
*/ - smake_("GE", " ", " ", &m, &n, &b[b_offset], + smake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1866,7 +1876,7 @@ static integer c__2 = 2; } strmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1899,7 +1909,7 @@ static integer c__2 = 2; } strsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -1968,18 +1978,18 @@ static integer c__2 = 2; smmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b84, &c__[c_offset], + c_b84, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { smmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b84, &c__[c_offset], + c_b84, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -1992,10 +2002,10 @@ static integer c__2 = 2; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { c__[i__ + j * c_dim1] = bb[i__ + (j - 1) * ldb]; - bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * + bb[i__ + (j - 1) * ldb] = alpha * b[i__ + j * b_dim1]; /* L60: */ } @@ -2008,16 +2018,16 @@ static integer c__2 = 2; &c__[c_offset], nmax, & c_b84, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { smmch_("N", transa, &m, &n, &n, & - c_b94, &c__[c_offset], - nmax, &a[a_offset], nmax, - &c_b84, &b[b_offset], + c_b94, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b84, &b[b_offset], nmax, &ct[1], &g[1], &bb[ - 1], 
&ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_false, ( ftnlen)1, (ftnlen)1); } @@ -2099,10 +2109,10 @@ static integer c__2 = 2; } /* schk3_ */ /* Subroutine */ int schk4_(char *sname, real *eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * - nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, - real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, + nbet, real *bet, integer *nmax, real *a, real *aa, real *as, real *b, + real *bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, ftnlen sname_len) { /* Initialized data */ @@ -2129,7 +2139,7 @@ static integer c__2 = 2; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; alist al__1; @@ -2151,22 +2161,22 @@ static integer c__2 = 2; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * - , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, + , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer * - , real *, real *, integer *, real *, real *, real *, integer *, + , real *, real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int ssyrk_(char *, char *, integer *, integer *, - real *, real *, integer *, real *, real *, integer *, ftnlen, + extern /* Subroutine */ int ssyrk_(char *, char 
*, integer *, integer *, + real *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); char transs[1]; @@ -2293,7 +2303,7 @@ static integer c__2 = 2; /* Generate the matrix C. */ - smake_("SY", uplo, " ", &n, &n, &c__[c_offset], + smake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2350,7 +2360,7 @@ static integer c__2 = 2; al__1.aunit = *ntra; f_rew(&al__1); } - ssyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, + ssyrk_(uplo, trans, &n, &k, &alpha, &aa[1], &lda, &beta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1) ; @@ -2366,9 +2376,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2421,19 +2431,19 @@ static integer c__2 = 2; } if (tran) { smmch_("T", "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { smmch_("N", "T", &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2526,7 +2536,7 @@ static integer c__2 = 2; } /* schk4_ */ /* Subroutine */ int schk5_(char *sname, real 
*eps, real *thresh, integer * - nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, integer *nidim, integer *idim, integer *nalf, real *alf, integer * nbet, real *bet, integer *nmax, real *ab, real *aa, real *as, real * bb, real *bs, real *c__, real *cc, real *cs, real *ct, real *g, real * @@ -2579,22 +2589,22 @@ static integer c__2 = 2; char uplo[1]; real alpha; logical isame[13]; - extern /* Subroutine */ int smake_(char *, char *, char *, integer *, + extern /* Subroutine */ int smake_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real * - , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, + , ftnlen, ftnlen, ftnlen), smmch_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer * - , real *, real *, integer *, real *, real *, real *, integer *, + , real *, real *, integer *, real *, real *, real *, integer *, real *, real *, logical *, integer *, logical *, ftnlen, ftnlen); integer nargs; logical reset; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int ssyr2k_(char *, char *, integer *, integer *, - real *, real *, integer *, real *, integer *, real *, real *, + extern /* Subroutine */ int ssyr2k_(char *, char *, integer *, integer *, + real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen); real errmax; - extern logical lseres_(char *, char *, integer *, integer *, real *, real + extern logical lseres_(char *, char *, integer *, integer *, real *, real *, integer *, ftnlen, ftnlen); char transs[1]; @@ -2740,7 +2750,7 @@ static integer c__2 = 2; /* Generate the matrix C. 
*/ - smake_("SY", uplo, " ", &n, &n, &c__[c_offset], + smake_("SY", uplo, " ", &n, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b84, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2821,9 +2831,9 @@ static integer c__2 = 2; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -2880,7 +2890,7 @@ static integer c__2 = 2; if (tran) { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(j - 1 << 1) * *nmax + w[i__] = ab[(j - 1 << 1) * *nmax + k + i__]; w[k + i__] = ab[(j - 1 << 1) * * nmax + i__]; @@ -2891,17 +2901,17 @@ static integer c__2 = 2; i__8 = *nmax << 1; smmch_("T", "N", &lj, &c__1, &i__6, & alpha, &ab[jjab], &i__7, &w[1] - , &i__8, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + , &i__8, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - w[i__] = ab[(k + i__ - 1) * *nmax + w[i__] = ab[(k + i__ - 1) * *nmax + j]; - w[k + i__] = ab[(i__ - 1) * *nmax + w[k + i__] = ab[(i__ - 1) * *nmax + j]; /* L60: */ } @@ -2909,9 +2919,9 @@ static integer c__2 = 2; i__7 = *nmax << 1; smmch_("N", "N", &lj, &c__1, &i__6, & alpha, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3007,7 +3017,7 @@ static integer c__2 = 2; } /* schk5_ */ -/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int schke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3020,22 +3030,22 
@@ static integer c__2 = 2; integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - real a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* + real a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was [2][1] */, beta, alpha; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, - integer *, real *, real *, integer *, real *, integer *, real *, + extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen), strmm_(char *, char *, char *, - char *, integer *, integer *, real *, real *, integer *, real *, + char *, integer *, integer *, real *, real *, integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), ssymm_(char *, char *, - integer *, integer *, real *, real *, integer *, real *, integer - *, real *, real *, integer *, ftnlen, ftnlen), strsm_(char *, - char *, char *, char *, integer *, integer *, real *, real *, - integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), - ssyrk_(char *, char *, integer *, integer *, real *, real *, + integer *, integer *, real *, real *, integer *, real *, integer + *, real *, real *, integer *, ftnlen, ftnlen), strsm_(char *, + char *, char *, char *, integer *, integer *, real *, real *, + integer *, real *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), + ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer *, ftnlen, ftnlen), ssyr2k_( - char *, char *, integer *, integer *, real *, real *, integer *, - real *, integer *, real *, real *, integer *, ftnlen, ftnlen), - chkxer_(char *, integer *, integer *, logical *, logical *, + char *, char *, integer *, integer *, real *, real *, integer *, + real *, integer *, real *, real *, integer *, ftnlen, ftnlen), + chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3089,142 +3099,142 @@ static 
integer c__2 = 2; } L10: infoc_1.infot = 1; - sgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - sgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - sgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - sgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - sgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - sgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - sgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, 
(ftnlen) 6); infoc_1.infot = 3; - sgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - sgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - sgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - sgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); 
infoc_1.infot = 5; - sgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - sgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - sgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - sgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 
10; - sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + sgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - sgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - sgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + sgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + sgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - sgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + sgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -3928,9 +3938,9 @@ 
static integer c__2 = 2; } /* schke_ */ -/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, +/* Subroutine */ int smake_(char *type__, char *uplo, char *diag, integer *m, integer *n, real *a, integer *nmax, real *aa, integer *lda, logical * - reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen + reset, real *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4075,7 +4085,7 @@ static integer c__2 = 2; /* Subroutine */ int smmch_(char *transa, char *transb, integer *m, integer * n, integer *kk, real *alpha, real *a, integer *lda, real *b, integer * ldb, real *beta, real *c__, integer *ldc, real *ct, real *g, real *cc, - integer *ldcc, real *eps, real *err, logical *fatal, integer *nout, + integer *ldcc, real *eps, real *err, logical *fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) { /* Format strings */ @@ -4087,7 +4097,7 @@ static integer c__2 = 2; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3; real r__1, r__2; @@ -4141,9 +4151,9 @@ static integer c__2 = 2; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; - tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; /* Compute expected result, one column at a time, in CT using data */ @@ -4165,7 +4175,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[k + j * b_dim1]; - g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 = b[k + j * b_dim1], abs(r__2)); /* L20: */ } @@ -4177,7 +4187,7 @@ static 
integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[k + j * b_dim1]; - g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 = b[k + j * b_dim1], abs(r__2)); /* L40: */ } @@ -4189,7 +4199,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[i__ + k * a_dim1] * b[j + k * b_dim1]; - g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[i__ + k * a_dim1], abs(r__1)) * (r__2 = b[j + k * b_dim1], abs(r__2)); /* L60: */ } @@ -4201,7 +4211,7 @@ static integer c__2 = 2; i__3 = *m; for (i__ = 1; i__ <= i__3; ++i__) { ct[i__] += a[k + i__ * a_dim1] * b[j + k * b_dim1]; - g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 + g[i__] += (r__1 = a[k + i__ * a_dim1], abs(r__1)) * (r__2 = b[j + k * b_dim1], abs(r__2)); /* L80: */ } @@ -4328,7 +4338,7 @@ logical lse_(real *ri, real *rj, integer *lr) } /* lse_ */ -logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, +logical lseres_(char *type__, char *uplo, integer *m, integer *n, real *aa, real *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -4495,7 +4505,7 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/zblat1.c b/blastest/src/zblat1.c index b620910be..93a24f4c3 100644 --- a/blastest/src/zblat1.c +++ b/blastest/src/zblat1.c @@ -68,6 +68,11 @@ static doublereal c_b52 = 0.; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "zblat1"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ static doublereal sfac = 9.765625e-4; @@ 
-84,7 +89,7 @@ static doublereal c_b52 = 0.; /* Local variables */ integer ic; - extern /* Subroutine */ int check1_(doublereal *), check2_(doublereal *), + extern /* Subroutine */ int check1_(doublereal *), check2_(doublereal *), header_(void); /* Fortran I/O blocks */ @@ -136,7 +141,12 @@ static doublereal c_b52 = 0.; } s_stop("", (ftnlen)0); - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ /* Subroutine */ int header_(void) @@ -222,7 +232,7 @@ static doublereal c_b52 = 0.; doublecomplex z__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -230,14 +240,14 @@ static doublereal c_b52 = 0.; integer i__; doublecomplex cx[8]; integer np1, len; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, - doublecomplex *, integer *), ctest_(integer *, doublecomplex *, + extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + doublecomplex *, integer *), ctest_(integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *); doublecomplex mwpcs[5], mwpct[5]; extern /* Subroutine */ int itest1_(integer *, integer *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int stest1_(doublereal *, doublereal *, - doublereal *, doublereal *), zdscal_(integer *, doublereal *, + extern /* Subroutine */ int stest1_(doublereal *, doublereal *, + doublereal *, doublereal *), zdscal_(integer *, doublereal *, doublecomplex *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); extern doublereal dzasum_(integer *, doublecomplex *, integer *); @@ -433,7 +443,7 @@ static doublereal c_b52 = 0.; 0.,0.},{0.,0.},{0.,0.},{0.,0.},{.7,-.8},{-.9,.5},{-.4,-.7},{0.,0.} ,{0.,0.},{0.,0.},{0.,0.},{.7,-.8},{-.9,.5},{-.4,-.7},{.1,-.5},{ 
-.1,-.9},{-.5,-.3},{.2,-.8} }; - static doublecomplex csize1[4] = { {0.,0.},{.9,.9},{1.63,1.73},{2.9,2.78} + static doublecomplex csize1[4] = { {0.,0.},{.9,.9},{1.63,1.73},{2.9,2.78} }; static doublecomplex csize3[14] = { {0.,0.},{0.,0.},{0.,0.},{0.,0.},{0., 0.},{0.,0.},{0.,0.},{1.17,1.17},{1.17,1.17},{1.17,1.17},{1.17, @@ -447,7 +457,7 @@ static doublereal c_b52 = 0.; doublecomplex z__1; /* Builtin functions */ - integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_wsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_wsle(void); /* Subroutine */ int s_stop(char *, ftnlen); @@ -457,7 +467,7 @@ static doublereal c_b52 = 0.; integer mx, my; doublecomplex cdot[1]; integer lenx, leny; - extern /* Subroutine */ int ctest_(integer *, doublecomplex *, + extern /* Subroutine */ int ctest_(integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *); extern /* Double Complex */ #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL @@ -465,10 +475,10 @@ static doublereal c_b52 = 0.; #else doublecomplex zdotc_( #endif - integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ksize; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern /* Double Complex */ #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL @@ -476,10 +486,10 @@ doublecomplex zdotc_( #else doublecomplex zdotu_( #endif - integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, - doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); /* Fortran I/O blocks */ @@ -669,11 +679,11 @@ doublecomplex zdotu_( } /* stest_ */ 
-/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, +/* Subroutine */ int stest1_(doublereal *scomp1, doublereal *strue1, doublereal *ssize, doublereal *sfac) { doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); /* ************************* STEST1 ***************************** */ @@ -715,7 +725,7 @@ doublereal sdiff_(doublereal *sa, doublereal *sb) return ret_val; } /* sdiff_ */ -/* Subroutine */ int ctest_(integer *len, doublecomplex *ccomp, doublecomplex +/* Subroutine */ int ctest_(integer *len, doublecomplex *ccomp, doublecomplex *ctrue, doublecomplex *csize, doublereal *sfac) { /* System generated locals */ @@ -727,7 +737,7 @@ doublereal sdiff_(doublereal *sa, doublereal *sb) /* Local variables */ integer i__; doublereal scomp[20], ssize[20], strue[20]; - extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ int stest_(integer *, doublereal *, doublereal *, doublereal *, doublereal *); /* **************************** CTEST ***************************** */ diff --git a/blastest/src/zblat2.c b/blastest/src/zblat2.c index 030f03b83..5550b413f 100644 --- a/blastest/src/zblat2.c +++ b/blastest/src/zblat2.c @@ -157,10 +157,15 @@ static logical c_false = FALSE_; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "zblat2"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*17] = "ZGEMV " "ZGBMV " "ZHEMV " "ZHBMV " "ZHPMV " - "ZTRMV " "ZTBMV " "ZTPMV " "ZTRSV " "ZTBSV " "ZTPSV " "ZGERC " + static char snames[6*17] = "ZGEMV " "ZGBMV " "ZHEMV " "ZHBMV " "ZHPMV " + "ZTRMV " "ZTBMV " "ZTPMV " "ZTRSV " "ZTBSV " "ZTPSV " "ZGERC " "ZGERU " "ZHER " "ZHPR " "ZHER2 " "ZHPR2 "; /* Format strings */ @@ -208,10 
+213,10 @@ static logical c_false = FALSE_; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -234,53 +239,53 @@ static logical c_false = FALSE_; integer ninc, nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, - integer *, integer *, integer *, integer *, doublecomplex *, - integer *, doublecomplex *, integer *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, doublecomplex *, + extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, + integer *, integer *, integer *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, doublecomplex *, integer *, - doublecomplex *, integer *, integer *, integer *, integer *, + ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, 
doublecomplex *, integer *, + doublecomplex *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * - , doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, integer *, integer *, integer *, integer *, integer *, + , doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, integer *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - doublecomplex *, ftnlen), zchk4_(char *, doublereal *, - doublereal *, integer *, integer *, logical *, logical *, logical - *, integer *, integer *, integer *, doublecomplex *, integer *, + doublecomplex *, ftnlen), zchk4_(char *, doublereal *, + doublereal *, integer *, integer *, logical *, logical *, logical + *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, ftnlen), zchk5_( - char *, doublereal *, doublereal *, integer *, integer *, logical - *, logical *, logical *, integer *, integer *, integer *, - doublecomplex *, integer *, integer *, integer *, integer *, + char *, doublereal *, doublereal *, integer *, integer *, logical + *, logical *, logical *, integer *, 
integer *, integer *, + doublecomplex *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * - , doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - doublecomplex *, ftnlen), zchk6_(char *, doublereal *, doublereal - *, integer *, integer *, logical *, logical *, logical *, integer - *, integer *, integer *, doublecomplex *, integer *, integer *, - integer *, integer *, doublecomplex *, doublecomplex *, + , doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + doublecomplex *, ftnlen), zchk6_(char *, doublereal *, doublereal + *, integer *, integer *, logical *, logical *, logical *, integer + *, integer *, integer *, doublecomplex *, integer *, integer *, + integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * - , doublecomplex *, doublecomplex *, doublecomplex *, + , doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, ftnlen); logical fatal, trace; integer nidim; extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen); char snaps[32], trans[1]; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); integer isnum; logical ltest[17], sfatal; @@ -630,7 +635,7 @@ static logical c_false = FALSE_; goto L80; } for (i__ = 1; i__ <= 17; ++i__) { - if 
(s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L70; } @@ -689,7 +694,7 @@ static logical c_false = FALSE_; /* YY holds the exact result. On exit from ZMVCH YT holds */ /* the result computed by ZMVCH. */ *(unsigned char *)trans = 'N'; - zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, + zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c__1, &c_b1, y, &c__1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lze_(yy, yt, &n); if (! same || err != 0.) { @@ -702,7 +707,7 @@ static logical c_false = FALSE_; s_stop("", (ftnlen)0); } *(unsigned char *)trans = 'T'; - zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, + zmvch_(trans, &n, &n, &c_b2, a, &c__65, x, &c_n1, &c_b1, y, &c_n1, yt, g, yy, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1); same = lze_(yy, yt, &n); if (! same || err != 0.) { @@ -763,44 +768,44 @@ static logical c_false = FALSE_; /* Test ZGEMV, 01, and ZGBMV, 02. */ L140: zchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. */ L150: zchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, - &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &nalf, alf, + &nbet, bet, &ninc, inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, (ftnlen)6); goto L200; /* Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, */ /* ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. 
*/ L160: zchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, + trace, &rewi, &fatal, &nidim, idim, &nkb, kb, &ninc, inc, &c__65, &c__2, a, aa, as, y, yy, ys, yt, g, z__, (ftnlen) 6); goto L200; /* Test ZGERC, 12, ZGERU, 13. */ L170: zchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test ZHER, 14, and ZHPR, 15. */ L180: zchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); goto L200; /* Test ZHER2, 16, and ZHPR2, 17. */ L190: zchk6_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, - inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &ninc, + inc, &c__65, &c__2, a, aa, as, x, xx, xs, y, yy, ys, yt, g, z__, (ftnlen)6); L200: @@ -842,16 +847,21 @@ static logical c_false = FALSE_; /* End of ZBLAT2. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. 
+ return 0; +#endif } /* main */ -/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, ftnlen sname_len) { @@ -881,7 +891,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -904,7 +914,7 @@ static logical c_false = FALSE_; logical full, tran, null; doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); @@ -912,22 +922,22 @@ static logical c_false = FALSE_; logical reset; integer incxs, incys; extern /* Subroutine */ int zgbmv_(char *, integer *, integer *, integer * - , integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + , 
integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen); char trans[1]; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), - zmvch_(char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - doublereal *, doublecomplex *, doublereal *, doublereal *, + extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), + zmvch_(char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); logical banded; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1]; @@ -1108,9 +1118,9 @@ static logical c_false = FALSE_; transl.r = 0., transl.i = 0.; i__7 = abs(incy); i__8 = ml - 1; - zmake_("GE", " ", " ", &c__1, &ml, &y[1], + zmake_("GE", " ", " ", &c__1, &ml, &y[1], &c__1, &yy[1], &i__7, &c__0, & - i__8, &reset, &transl, (ftnlen)2, + i__8, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1118,7 +1128,7 @@ static logical c_false = FALSE_; /* Save every datum before calling the */ /* subroutine. 
*/ - *(unsigned char *)transs = *(unsigned + *(unsigned char *)transs = *(unsigned char *)trans; ms = m; ns = n; @@ -1129,7 +1139,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - as[i__8].r = aa[i__9].r, as[i__8].i = + as[i__8].r = aa[i__9].r, as[i__8].i = aa[i__9].i; /* L10: */ } @@ -1138,7 +1148,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - xs[i__8].r = xx[i__9].r, xs[i__8].i = + xs[i__8].r = xx[i__9].r, xs[i__8].i = xx[i__9].i; /* L20: */ } @@ -1148,7 +1158,7 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__7; ++i__) { i__8 = i__; i__9 = i__; - ys[i__8].r = yy[i__9].r, ys[i__8].i = + ys[i__8].r = yy[i__9].r, ys[i__8].i = yy[i__9].i; /* L30: */ } @@ -1187,7 +1197,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - zgemv_(trans, &m, &n, &alpha, &aa[1], + zgemv_(trans, &m, &n, &alpha, &aa[1], &lda, &xx[1], &incx, &beta, & yy[1], &incy, (ftnlen)1); } else if (banded) { @@ -1248,7 +1258,7 @@ static logical c_false = FALSE_; isame[1] = ms == m; isame[2] = ns == n; if (full) { - isame[3] = als.r == alpha.r && als.i + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lze_(&as[1], &aa[1], &laa); isame[5] = ldas == lda; @@ -1270,13 +1280,13 @@ static logical c_false = FALSE_; } else if (banded) { isame[3] = kls == kl; isame[4] = kus == ku; - isame[5] = als.r == alpha.r && als.i + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lze_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lze_(&xs[1], &xx[1], &lx); isame[9] = incxs == incx; - isame[10] = bls.r == beta.r && bls.i + isame[10] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[11] = lze_(&ys[1], &yy[1], & @@ -1318,8 +1328,8 @@ static logical c_false = FALSE_; zmvch_(trans, &m, &n, &alpha, &a[ a_offset], nmax, &x[1], &incx, - &beta, &y[1], &incy, &yt[1], - &g[1], &yy[1], eps, &err, + &beta, &y[1], &incy, &yt[1], + &g[1], &yy[1], eps, &err, 
fatal, nout, &c_true, (ftnlen) 1); errmax = max(errmax,err); @@ -1423,13 +1433,13 @@ static logical c_false = FALSE_; } /* zchk1_ */ -/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *nalf, doublecomplex *alf, integer *nbet, doublecomplex *bet, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, ftnlen sname_len) { @@ -1463,7 +1473,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, + integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8, i__9; alist al__1; @@ -1472,7 +1482,7 @@ static logical c_false = FALSE_; f_rew(alist *); /* Local variables */ - integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, + integer i__, k, n, ia, ib, ic, nc, ik, in, nk, ks, ix, iy, ns, lx, ly, laa, lda; doublecomplex als, bls; doublereal err; @@ -1485,31 +1495,31 @@ static logical c_false = FALSE_; char uplo[1]; doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, 
integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; logical reset; integer incxs, incys; - extern /* Subroutine */ int zhbmv_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), - zmvch_(char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - doublereal *, doublecomplex *, doublereal *, doublereal *, + extern /* Subroutine */ int zhbmv_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), + zmvch_(char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen), zhemv_(char *, integer * - , doublecomplex *, doublecomplex *, integer *, doublecomplex *, + , doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen); char uplos[1]; - extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen); logical banded, packed; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1672,7 +1682,7 @@ static logical c_false = FALSE_; i__8 = n - 1; zmake_("GE", " ", " ", &c__1, &n, &y[1], & c__1, 
&yy[1], &i__7, &c__0, &i__8, & - reset, &transl, (ftnlen)2, (ftnlen)1, + reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1824,13 +1834,13 @@ static logical c_false = FALSE_; unsigned char *)uplos; isame[1] = ns == n; if (full) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lze_(&as[1], &aa[1], &laa); isame[4] = ldas == lda; isame[5] = lze_(&xs[1], &xx[1], &lx); isame[6] = incxs == incx; - isame[7] = bls.r == beta.r && bls.i == + isame[7] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[8] = lze_(&ys[1], &yy[1], &ly); @@ -1843,13 +1853,13 @@ static logical c_false = FALSE_; isame[9] = incys == incy; } else if (banded) { isame[2] = ks == k; - isame[3] = als.r == alpha.r && als.i == + isame[3] = als.r == alpha.r && als.i == alpha.i; isame[4] = lze_(&as[1], &aa[1], &laa); isame[5] = ldas == lda; isame[6] = lze_(&xs[1], &xx[1], &lx); isame[7] = incxs == incx; - isame[8] = bls.r == beta.r && bls.i == + isame[8] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[9] = lze_(&ys[1], &yy[1], &ly); @@ -1861,12 +1871,12 @@ static logical c_false = FALSE_; } isame[10] = incys == incy; } else if (packed) { - isame[2] = als.r == alpha.r && als.i == + isame[2] = als.r == alpha.r && als.i == alpha.i; isame[3] = lze_(&as[1], &aa[1], &laa); isame[4] = lze_(&xs[1], &xx[1], &lx); isame[5] = incxs == incx; - isame[6] = bls.r == beta.r && bls.i == + isame[6] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[7] = lze_(&ys[1], &yy[1], &ly); @@ -1904,8 +1914,8 @@ static logical c_false = FALSE_; /* Check the result. 
*/ - zmvch_("N", &n, &n, &alpha, &a[a_offset], - nmax, &x[1], &incx, &beta, &y[1], + zmvch_("N", &n, &n, &alpha, &a[a_offset], + nmax, &x[1], &incx, &beta, &y[1], &incy, &yt[1], &g[1], &yy[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); @@ -2015,12 +2025,12 @@ static logical c_false = FALSE_; } /* zchk2_ */ -/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, - integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *xt, + fatal, integer *nidim, integer *idim, integer *nkb, integer *kb, + integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *xt, doublereal *g, doublecomplex *z__, ftnlen sname_len) { /* Initialized data */ @@ -2060,7 +2070,7 @@ static logical c_false = FALSE_; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, + integer i__, k, n, nc, ik, in, nk, ks, ix, ns, lx, laa, icd, lda, ict, icu; doublereal err; extern logical lze_(doublecomplex *, doublecomplex *, integer *); @@ -2071,7 +2081,7 @@ static logical c_false = FALSE_; logical full, null; char uplo[1], diags[1]; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); @@ -2079,28 +2089,28 @@ static logical c_false = FALSE_; logical reset; integer 
incxs; char trans[1]; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); char uplos[1]; - extern /* Subroutine */ int ztbmv_(char *, char *, char *, integer *, + extern /* Subroutine */ int ztbmv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztbsv_(char *, char *, char *, integer * - , integer *, doublecomplex *, integer *, doublecomplex *, integer - *, ftnlen, ftnlen, ftnlen), ztpmv_(char *, char *, char *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen), ztrmv_(char *, char *, char *, integer *, - doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen), ztpsv_(char *, char *, char *, integer *, - doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen, + , integer *, doublecomplex *, integer *, doublecomplex *, integer + *, ftnlen, ftnlen, ftnlen), ztpmv_(char *, char *, char *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen), ztrmv_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen), ztpsv_(char *, char *, char *, integer *, + doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen); logical banded, packed; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, 
char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1]; @@ -2226,13 +2236,13 @@ static logical c_false = FALSE_; ; for (icd = 1; icd <= 2; ++icd) { - *(unsigned char *)diag = *(unsigned char *)&ichd[icd + *(unsigned char *)diag = *(unsigned char *)&ichd[icd - 1]; /* Generate the matrix A. */ transl.r = 0., transl.i = 0.; - zmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], + zmake_(sname + 1, uplo, diag, &n, &n, &a[a_offset], nmax, &aa[1], &lda, &k, &k, &reset, &transl, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -2287,7 +2297,7 @@ static logical c_false = FALSE_; /* Call the subroutine. */ - if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) + if (s_cmp(sname + 3, "MV", (ftnlen)2, (ftnlen)2) == 0) { if (full) { if (*trace) { @@ -2340,7 +2350,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ztbmv_(uplo, trans, diag, &n, &k, &aa[1], + ztbmv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2421,7 +2431,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - ztbsv_(uplo, trans, diag, &n, &k, &aa[1], + ztbsv_(uplo, trans, diag, &n, &k, &aa[1], &lda, &xx[1], &incx, (ftnlen)1, ( ftnlen)1, (ftnlen)1); } else if (packed) { @@ -2463,11 +2473,11 @@ static logical c_false = FALSE_; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)uplo == *(unsigned + isame[0] = *(unsigned char *)uplo == *(unsigned char *)uplos; - isame[1] = *(unsigned char *)trans == *(unsigned + isame[1] = *(unsigned char *)trans == *(unsigned char *)transs; - isame[2] = *(unsigned char *)diag == *(unsigned + isame[2] = *(unsigned char *)diag == *(unsigned char *)diags; isame[3] = ns == n; if (full) { @@ -2537,7 +2547,7 @@ static logical c_false = FALSE_; zmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &x[1], &incx, &c_b1, &z__[ - 1], &incx, &xt[1], &g[1], &xx[1], + 1], &incx, &xt[1], &g[1], &xx[1], eps, &err, fatal, nout, &c_true, ( ftnlen)1); } else if (s_cmp(sname + 3, "SV", (ftnlen)2, ( @@ -2549,18 +2559,18 @@ static logical c_false = FALSE_; for (i__ = 1; i__ <= i__4; ++i__) { i__5 = i__; i__6 = (i__ - 1) * abs(incx) + 1; - z__[i__5].r = xx[i__6].r, z__[i__5].i + z__[i__5].r = xx[i__6].r, z__[i__5].i = xx[i__6].i; i__5 = (i__ - 1) * abs(incx) + 1; i__6 = i__; - xx[i__5].r = x[i__6].r, xx[i__5].i = + xx[i__5].r = x[i__6].r, xx[i__5].i = x[i__6].i; /* L50: */ } zmvch_(trans, &n, &n, &c_b2, &a[a_offset], nmax, &z__[1], &incx, &c_b1, &x[ - 1], &incx, &xt[1], &g[1], &xx[1], - eps, &err, fatal, nout, &c_false, + 1], &incx, &xt[1], &g[1], &xx[1], + eps, &err, fatal, nout, &c_false, (ftnlen)1); } errmax = max(errmax,err); @@ -2662,12 +2672,12 @@ static logical c_false = FALSE_; } /* zchk3_ */ -/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, 
doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, doublecomplex *z__, ftnlen sname_len) { @@ -2713,26 +2723,26 @@ static logical c_false = FALSE_; logical null; doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, + extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical reset; integer incxs, incys; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen), zgeru_( integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -2834,7 +2844,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = m - 1; 
zmake_("GE", " ", " ", &c__1, &m, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (m > 1) { i__3 = m / 2; @@ -2873,7 +2883,7 @@ static logical c_false = FALSE_; transl.r = 0., transl.i = 0.; i__5 = m - 1; i__6 = n - 1; - zmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], + zmake_(sname + 1, " ", " ", &m, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3032,9 +3042,9 @@ static logical c_false = FALSE_; d_cnjg(&z__1, w); w[0].r = z__1.r, w[0].i = z__1.i; } - zmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, + zmvch_("N", &m, &c__1, &alpha, &z__[1], nmax, w, &c__1, &c_b2, &a[j * a_dim1 + 1], & - c__1, &yt[1], &g[1], &aa[(j - 1) * + c__1, &yt[1], &g[1], &aa[(j - 1) * lda + 1], eps, &err, fatal, nout, & c_true, (ftnlen)1); errmax = max(errmax,err); @@ -3114,12 +3124,12 @@ static logical c_false = FALSE_; } /* zchk4_ */ -/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, doublecomplex *z__, ftnlen sname_len) { @@ -3169,32 +3179,32 @@ static logical c_false = FALSE_; doublereal rals; integer incx; logical full; - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + extern /* Subroutine */ 
int zher_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen); logical null; char uplo[1]; - extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, + extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, ftnlen); doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; logical reset; integer incxs; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); logical upper; char uplos[1]; logical packed; doublereal ralpha, errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -3297,7 +3307,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; zmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3372,7 +3382,7 @@ static logical c_false = FALSE_; al__1.aunit = *ntra; f_rew(&al__1); } - zher_(uplo, &n, &ralpha, &xx[1], 
&incx, &aa[1], &lda, + zher_(uplo, &n, &ralpha, &xx[1], &incx, &aa[1], &lda, (ftnlen)1); } else if (packed) { if (*trace) { @@ -3482,9 +3492,9 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - zmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, - &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, - &yt[1], &g[1], &aa[ja], eps, &err, fatal, + zmvch_("N", &lj, &c__1, &alpha, &z__[jj], &lj, w, + &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, + &yt[1], &g[1], &aa[ja], eps, &err, fatal, nout, &c_true, (ftnlen)1); if (full) { if (upper) { @@ -3582,12 +3592,12 @@ static logical c_false = FALSE_; } /* zchk5_ */ -/* Subroutine */ int zchk6_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk6_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, - doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex - *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, + alf, integer *ninc, integer *inc, integer *nmax, integer *incmax, + doublecomplex *a, doublecomplex *aa, doublecomplex *as, doublecomplex + *x, doublecomplex *xx, doublecomplex *xs, doublecomplex *y, doublecomplex *yy, doublecomplex *ys, doublecomplex *yt, doublereal * g, doublecomplex *z__, ftnlen sname_len) { @@ -3617,7 +3627,7 @@ static logical c_false = FALSE_; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, + integer a_dim1, a_offset, z_dim1, z_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1, z__2, z__3; alist al__1; @@ -3639,31 +3649,31 @@ static logical c_false = FALSE_; integer incx, incy; logical full, null; char uplo[1]; - extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, integer 
*, ftnlen), zhpr2_(char *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, ftnlen), zhpr2_(char *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, ftnlen); doublecomplex alpha; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; logical reset; integer incxs, incys; - extern /* Subroutine */ int zmvch_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, doublereal *, + extern /* Subroutine */ int zmvch_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen); logical upper; char uplos[1]; logical packed; doublereal errmax; doublecomplex transl; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -3768,7 +3778,7 @@ static logical c_false = FALSE_; i__3 = abs(incx); i__4 = n - 1; zmake_("GE", " ", " ", &c__1, &n, &x[1], &c__1, &xx[1], &i__3, - &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, + &c__0, &i__4, &reset, &transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); if (n > 1) { i__3 = n / 2; @@ -3808,7 +3818,7 @@ static logical c_false = FALSE_; 
transl.r = 0., transl.i = 0.; i__5 = n - 1; i__6 = n - 1; - zmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], + zmake_(sname + 1, uplo, " ", &n, &n, &a[a_offset], nmax, &aa[1], &lda, &i__5, &i__6, &reset, & transl, (ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -3996,14 +4006,14 @@ static logical c_false = FALSE_; i__5 = n; for (j = 1; j <= i__5; ++j) { d_cnjg(&z__2, &z__[j + (z_dim1 << 1)]); - z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, - z__1.i = alpha.r * z__2.i + alpha.i * + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * z__2.r; w[0].r = z__1.r, w[0].i = z__1.i; d_cnjg(&z__2, &alpha); d_cnjg(&z__3, &z__[j + z_dim1]); - z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, - z__1.i = z__2.r * z__3.i + z__2.i * + z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, + z__1.i = z__2.r * z__3.i + z__2.i * z__3.r; w[1].r = z__1.r, w[1].i = z__1.i; if (upper) { @@ -4013,8 +4023,8 @@ static logical c_false = FALSE_; jj = j; lj = n - j + 1; } - zmvch_("N", &lj, &c__2, &c_b2, &z__[jj + - z_dim1], nmax, w, &c__1, &c_b2, &a[jj + zmvch_("N", &lj, &c__2, &c_b2, &z__[jj + + z_dim1], nmax, w, &c__1, &c_b2, &a[jj + j * a_dim1], &c__1, &yt[1], &g[1], & aa[ja], eps, &err, fatal, nout, & c_true, (ftnlen)1); @@ -4119,7 +4129,7 @@ static logical c_false = FALSE_; } /* zchk6_ */ -/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -4133,47 +4143,47 @@ static logical c_false = FALSE_; /* Local variables */ doublecomplex a[1] /* was [1][1] */, x[1], y[1], beta; - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, - doublecomplex *, integer *, doublecomplex *, integer *, ftnlen), + extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + doublecomplex *, integer *, doublecomplex *, integer *, ftnlen), zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *, - doublecomplex *, ftnlen), zher2_(char *, 
integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, integer *, ftnlen), zhpr2_(char *, - integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, ftnlen), zher2_(char *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, ftnlen), zhpr2_(char *, + integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, ftnlen); doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, integer *), zgbmv_(char *, integer *, integer *, + extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zgbmv_(char *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen), zhbmv_(char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), - zgemv_(char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, ftnlen), zhemv_(char - *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen), zgeru_(integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, integer *), ztbmv_(char *, char *, char *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen), zhbmv_(char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen), + 
zgemv_(char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, ftnlen), zhemv_(char + *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen), zgeru_(integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), ztbmv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - integer *, ftnlen, ftnlen, ftnlen), zhpmv_(char *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *, ftnlen), ztbsv_(char - *, char *, char *, integer *, integer *, doublecomplex *, integer + integer *, ftnlen, ftnlen, ftnlen), zhpmv_(char *, integer *, + doublecomplex *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, ftnlen), ztbsv_(char + *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztpmv_( - char *, char *, char *, integer *, doublecomplex *, doublecomplex - *, integer *, ftnlen, ftnlen, ftnlen), ztrmv_(char *, char *, - char *, integer *, doublecomplex *, integer *, doublecomplex *, + char *, char *, char *, integer *, doublecomplex *, doublecomplex + *, integer *, ftnlen, ftnlen, ftnlen), ztrmv_(char *, char *, + char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen), ztpsv_(char *, char *, char *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *, - doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen), ztrsv_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, 
ftnlen, ftnlen, ftnlen); doublereal ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -4702,9 +4712,9 @@ static logical c_false = FALSE_; } /* zchke_ */ -/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, - integer *lda, integer *kl, integer *ku, logical *reset, doublecomplex +/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, + integer *lda, integer *kl, integer *ku, logical *reset, doublecomplex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -4765,7 +4775,7 @@ static logical c_false = FALSE_; i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { if (gen || upper && i__ <= j || lower && i__ >= j) { - if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) + if (i__ <= j && j - i__ <= *ku || i__ >= j && i__ - j <= *kl) { i__3 = i__ + j * a_dim1; zbeg_(&z__2, reset); @@ -4998,11 +5008,11 @@ static logical c_false = FALSE_; } /* zmake_ */ -/* Subroutine */ int zmvch_(char *trans, integer *m, integer *n, +/* Subroutine */ int zmvch_(char *trans, integer *m, integer *n, doublecomplex *alpha, doublecomplex *a, integer *nmax, doublecomplex * x, integer *incx, doublecomplex *beta, doublecomplex *y, integer * - incy, doublecomplex *yt, doublereal *g, doublecomplex *yy, doublereal - *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, + incy, doublecomplex *yt, doublereal *g, doublecomplex *yy, doublereal + *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen trans_len) { /* Format strings */ @@ -5105,15 +5115,15 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = j + i__ * a_dim1; i__6 = jx; - z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + 
z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, z__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i; yt[i__3].r = z__1.r, yt[i__3].i = z__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j - + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, + g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j + + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4))); jx += incxl; /* L10: */ @@ -5125,14 +5135,14 @@ static logical c_false = FALSE_; i__4 = iy; d_cnjg(&z__3, &a[j + i__ * a_dim1]); i__5 = jx; - z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, z__2.i = + z__2.r = z__3.r * x[i__5].r - z__3.i * x[i__5].i, z__2.i = z__3.r * x[i__5].i + z__3.i * x[i__5].r; z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i; yt[i__3].r = z__1.r, yt[i__3].i = z__1.i; i__3 = j + i__ * a_dim1; i__4 = jx; - g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j - + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, + g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[j + + i__ * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4))); jx += incxl; /* L20: */ @@ -5144,7 +5154,7 @@ static logical c_false = FALSE_; i__4 = iy; i__5 = i__ + j * a_dim1; i__6 = jx; - z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, + z__2.r = a[i__5].r * x[i__6].r - a[i__5].i * x[i__6].i, z__2.i = a[i__5].r * x[i__6].i + a[i__5].i * x[i__6] .r; z__1.r = yt[i__4].r + z__2.r, z__1.i = yt[i__4].i + z__2.i; @@ -5152,7 +5162,7 @@ static logical c_false = FALSE_; i__3 = i__ + j * a_dim1; i__4 = jx; g[iy] += ((d__1 = a[i__3].r, abs(d__1)) + (d__2 = d_imag(&a[ - i__ + j * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, + i__ + j * a_dim1]), abs(d__2))) * ((d__3 = x[i__4].r, abs(d__3)) + (d__4 = d_imag(&x[jx]), abs(d__4))); jx += incxl; /* L30: */ @@ -5160,7 +5170,7 @@ static logical 
c_false = FALSE_; } i__2 = iy; i__3 = iy; - z__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, z__2.i = + z__2.r = alpha->r * yt[i__3].r - alpha->i * yt[i__3].i, z__2.i = alpha->r * yt[i__3].i + alpha->i * yt[i__3].r; i__4 = iy; z__3.r = beta->r * y[i__4].r - beta->i * y[i__4].i, z__3.i = beta->r * @@ -5169,7 +5179,7 @@ static logical c_false = FALSE_; yt[i__2].r = z__1.r, yt[i__2].i = z__1.i; i__2 = iy; g[iy] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), abs( - d__2))) * g[iy] + ((d__3 = beta->r, abs(d__3)) + (d__4 = + d__2))) * g[iy] + ((d__3 = beta->r, abs(d__3)) + (d__4 = d_imag(beta), abs(d__4))) * ((d__5 = y[i__2].r, abs(d__5)) + ( d__6 = d_imag(&y[iy]), abs(d__6))); iy += incyl; @@ -5281,8 +5291,8 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) } /* lze_ */ -logical lzeres_(char *type__, char *uplo, integer *m, integer *n, - doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, +logical lzeres_(char *type__, char *uplo, integer *m, integer *n, + doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -5459,7 +5469,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/blastest/src/zblat3.c b/blastest/src/zblat3.c index 3ff3634b6..045eeba42 100644 --- a/blastest/src/zblat3.c +++ b/blastest/src/zblat3.c @@ -140,9 +140,14 @@ static integer c_n1 = -1; /* ===================================================================== */ /* Main program */ int main(void) { +#ifdef BLIS_ENABLE_HPX + char* program = "zblat3"; + bli_thread_initialize_hpx( 1, &program ); +#endif + /* Initialized data */ - static char snames[6*9] = "ZGEMM " "ZHEMM " "ZSYMM " "ZTRMM " "ZTRSM " + static char snames[6*9] = "ZGEMM " "ZHEMM " 
"ZSYMM " "ZTRMM " "ZTRSM " "ZHERK " "ZSYRK " "ZHER2K" "ZSYR2K"; /* Format strings */ @@ -186,10 +191,10 @@ static integer c_n1 = -1; cllist cl__1; /* Builtin functions */ - integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), + integer s_rsle(cilist *), do_lio(integer *, integer *, char *, ftnlen), e_rsle(void), f_open(olist *), s_wsfe(cilist *), do_fio(integer *, - char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), - s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, + char *, ftnlen), e_wsfe(void), s_wsle(cilist *), e_wsle(void), + s_rsfe(cilist *), e_rsfe(void), s_cmp(const char *, const char *, ftnlen, ftnlen); /* Subroutine */ int s_stop(char *, ftnlen); integer f_clos(cllist *); @@ -208,44 +213,44 @@ static integer c_n1 = -1; integer nbet, ntra; logical rewi; integer nout; - extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, - integer *, integer *, logical *, logical *, logical *, integer *, + extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, + integer *, integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, + ftnlen), zchk2_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), 
zchk3_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + *, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + ftnlen), zchk3_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, - ftnlen), zchk4_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, + ftnlen), zchk4_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, - ftnlen), zchk5_(char *, doublereal *, doublereal *, integer *, - integer *, logical *, logical *, logical *, integer *, integer *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + *, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublereal *, + ftnlen), zchk5_(char *, doublereal *, doublereal *, integer *, + integer *, logical *, logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex - *, doublecomplex *, doublecomplex *, doublecomplex *, - doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, + doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex + 
*, doublecomplex *, doublecomplex *, doublecomplex *, + doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, ftnlen); logical fatal, trace; integer nidim; - extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen), - zmmch_(char *, char *, integer *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublereal *, doublecomplex *, integer *, - doublereal *, doublereal *, logical *, integer *, logical *, + extern /* Subroutine */ int zchke_(integer *, char *, integer *, ftnlen), + zmmch_(char *, char *, integer *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublereal *, doublecomplex *, integer *, + doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); char snaps[32]; integer isnum; @@ -517,7 +522,7 @@ static integer c_n1 = -1; goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) + if (s_cmp(snamet, snames + (i__ - 1) * 6, (ftnlen)6, (ftnlen)6) == 0) { goto L50; } @@ -580,7 +585,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'N'; *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { @@ -595,7 +600,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! 
same || err != 0.) { @@ -628,7 +633,7 @@ static integer c_n1 = -1; *(unsigned char *)transa = 'C'; *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { @@ -643,7 +648,7 @@ static integer c_n1 = -1; } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & - c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, &nout, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { @@ -697,34 +702,34 @@ static integer c_n1 = -1; /* Test ZGEMM, 01. */ L140: zchk1_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test ZHEMM, 02, ZSYMM, 03. */ L150: zchk2_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test ZTRMM, 04, ZTRSM, 05. */ L160: zchk3_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, (ftnlen)6); goto L190; /* Test ZHERK, 06, ZSYRK, 07. 
*/ L170: zchk4_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, - bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, (ftnlen)6); goto L190; /* Test ZHER2K, 08, ZSYR2K, 09. */ L180: zchk5_(snames + (isnum - 1) * 6, &eps, &thresh, &nout, &ntra, & - trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, + trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, ( ftnlen)6); goto L190; @@ -768,15 +773,20 @@ static integer c_n1 = -1; /* End of ZBLAT3. */ - return 0; +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else + // Return peacefully. + return 0; +#endif } /* main */ -/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * g, ftnlen sname_len) { @@ -802,7 +812,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; alist al__1; @@ -811,7 +821,7 @@ static integer c_n1 = -1; f_rew(alist *); /* Local variables */ - integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, + 
integer i__, k, m, n, ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns, ica, icb, laa, lbb, lda, lcc, ldb, ldc; doublecomplex als, bls; doublereal err; @@ -821,23 +831,23 @@ static integer c_n1 = -1; logical same, null; doublecomplex alpha; logical isame[13], trana, tranb; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *, ftnlen, ftnlen), zgemm_(char *, char *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *, ftnlen, ftnlen), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); logical reset; char tranas[1], tranbs[1], transa[1], transb[1]; doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -928,7 +938,7 @@ static integer c_n1 = -1; for (ica = 1; ica <= 3; ++ica) { *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] ; - trana = 
*(unsigned char *)transa == 'T' || *(unsigned + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; if (trana) { @@ -956,9 +966,9 @@ static integer c_n1 = -1; ftnlen)1); for (icb = 1; icb <= 3; ++icb) { - *(unsigned char *)transb = *(unsigned char *)&ich[icb + *(unsigned char *)transb = *(unsigned char *)&ich[icb - 1]; - tranb = *(unsigned char *)transb == 'T' || *(unsigned + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; if (tranb) { @@ -1099,13 +1109,13 @@ static integer c_n1 = -1; isame[2] = ms == m; isame[3] = ns == n; isame[4] = ks == k; - isame[5] = als.r == alpha.r && als.i == + isame[5] = als.r == alpha.r && als.i == alpha.i; isame[6] = lze_(&as[1], &aa[1], &laa); isame[7] = ldas == lda; isame[8] = lze_(&bs[1], &bb[1], &lbb); isame[9] = ldbs == ldb; - isame[10] = bls.r == beta.r && bls.i == + isame[10] = bls.r == beta.r && bls.i == beta.i; if (null) { isame[11] = lze_(&cs[1], &cc[1], &lcc); @@ -1143,9 +1153,9 @@ static integer c_n1 = -1; zmmch_(transa, transb, &m, &n, &k, &alpha, &a[a_offset], nmax, &b[b_offset], - nmax, &beta, &c__[c_offset], + nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true, + eps, &err, fatal, nout, &c_true, (ftnlen)1, (ftnlen)1); errmax = max(errmax,err); /* If got really bad answer, report and */ @@ -1226,12 +1236,12 @@ static integer c_n1 = -1; } /* zchk1_ */ -/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex 
*bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * g, ftnlen sname_len) { @@ -1258,7 +1268,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; alist al__1; @@ -1267,7 +1277,7 @@ static integer c_n1 = -1; integer *, char *, ftnlen), e_wsfe(void), f_rew(alist *); /* Local variables */ - integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, + integer i__, m, n, ia, ib, na, nc, im, in, ms, ns, laa, lbb, lda, lcc, ldb, ldc, ics; doublecomplex als, bls; integer icu; @@ -1282,27 +1292,27 @@ static integer c_n1 = -1; doublecomplex alpha; logical isame[13]; char sides[1]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *, ftnlen, ftnlen), zhemm_(char *, char *, 
integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); logical reset; char uplos[1]; - extern /* Subroutine */ int zsymm_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, + extern /* Subroutine */ int zsymm_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1443,7 +1453,7 @@ static integer c_n1 = -1; /* Generate the matrix C. */ - zmake_("GE", " ", " ", &m, &n, &c__[c_offset], + zmake_("GE", " ", " ", &m, &n, &c__[c_offset], nmax, &cc[1], &ldc, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1539,9 +1549,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. 
*/ - isame[0] = *(unsigned char *)sides == *(unsigned + isame[0] = *(unsigned char *)sides == *(unsigned char *)side; - isame[1] = *(unsigned char *)uplos == *(unsigned + isame[1] = *(unsigned char *)uplos == *(unsigned char *)uplo; isame[2] = ms == m; isame[3] = ns == n; @@ -1586,14 +1596,14 @@ static integer c_n1 = -1; if (left) { zmmch_("N", "N", &m, &n, &m, &alpha, &a[ - a_offset], nmax, &b[b_offset], + a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { zmmch_("N", "N", &m, &n, &n, &alpha, &b[ - b_offset], nmax, &a[a_offset], + b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, &err, fatal, nout, &c_true, ( @@ -1673,12 +1683,12 @@ static integer c_n1 = -1; } /* zchk2_ */ -/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nmax, doublecomplex *a, doublecomplex *aa, - doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex - *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, ftnlen + alf, integer *nmax, doublecomplex *a, doublecomplex *aa, + doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex + *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, ftnlen sname_len) { /* Initialized data */ @@ -1705,7 +1715,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; alist al__1; @@ -1731,27 +1741,27 @@ static integer c_n1 = -1; char diags[1]; logical isame[13]; char sides[1]; - extern /* 
Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); logical reset; char uplos[1]; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), - ztrsm_(char *, char *, char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), + ztrsm_(char *, char *, char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen); char tranas[1], transa[1]; doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); /* Fortran I/O blocks */ @@ -1888,7 +1898,7 @@ static integer c_n1 = -1; /* Generate the matrix B. 
*/ - zmake_("GE", " ", " ", &m, &n, &b[b_offset], + zmake_("GE", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &reset, &c_b1, ( ftnlen)2, (ftnlen)1, (ftnlen)1); @@ -1960,7 +1970,7 @@ static integer c_n1 = -1; } ztrmm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 3, "SM", (ftnlen)2, ( ftnlen)2) == 0) { @@ -1993,7 +2003,7 @@ static integer c_n1 = -1; } ztrsm_(side, uplo, transa, diag, &m, &n, & alpha, &aa[1], &lda, &bb[1], &ldb, - (ftnlen)1, (ftnlen)1, (ftnlen)1, + (ftnlen)1, (ftnlen)1, (ftnlen)1, (ftnlen)1); } @@ -2019,7 +2029,7 @@ static integer c_n1 = -1; unsigned char *)diag; isame[4] = ms == m; isame[5] = ns == n; - isame[6] = als.r == alpha.r && als.i == + isame[6] = als.r == alpha.r && als.i == alpha.i; isame[7] = lze_(&as[1], &aa[1], &laa); isame[8] = ldas == lda; @@ -2063,18 +2073,18 @@ static integer c_n1 = -1; zmmch_(transa, "N", &m, &n, &m, & alpha, &a[a_offset], nmax, &b[b_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } else { zmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, &a[a_offset], nmax, & - c_b1, &c__[c_offset], + c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ - 1], &ldb, eps, &err, + 1], &ldb, eps, &err, fatal, nout, &c_true, ( ftnlen)1, (ftnlen)1); } @@ -2087,14 +2097,14 @@ static integer c_n1 = -1; i__4 = n; for (j = 1; j <= i__4; ++j) { i__5 = m; - for (i__ = 1; i__ <= i__5; ++i__) + for (i__ = 1; i__ <= i__5; ++i__) { i__6 = i__ + j * c_dim1; i__7 = i__ + (j - 1) * ldb; c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; i__6 = i__ + (j - 1) * ldb; i__7 = i__ + j * b_dim1; - z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, z__1.i = alpha.r * b[i__7].i + alpha.i * b[ i__7].r; bb[i__6].r = z__1.r, 
bb[i__6].i = z__1.i; @@ -2105,20 +2115,20 @@ static integer c_n1 = -1; if (left) { zmmch_(transa, "N", &m, &n, &m, & - c_b2, &a[a_offset], nmax, + c_b2, &a[a_offset], nmax, &c__[c_offset], nmax, & - c_b1, &b[b_offset], nmax, + c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } else { zmmch_("N", transa, &m, &n, &n, & - c_b2, &c__[c_offset], - nmax, &a[a_offset], nmax, + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, &c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & - ldb, eps, &err, fatal, + ldb, eps, &err, fatal, nout, &c_false, (ftnlen)1, (ftnlen)1); } @@ -2199,12 +2209,12 @@ static integer c_n1 = -1; } /* zchk3_ */ -/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical *trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * g, ftnlen sname_len) { @@ -2236,7 +2246,7 @@ static integer c_n1 = -1; "ER:\002)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; alist al__1; @@ -2262,29 +2272,29 @@ static integer c_n1 = -1; doublecomplex alpha; doublereal rbeta; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, 
doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); doublereal rbets; logical reset; - extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, - doublereal *, doublecomplex *, integer *, doublereal *, + extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, + doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, ftnlen, ftnlen); char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int zsyrk_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, + extern /* Subroutine */ int zsyrk_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal ralpha, errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1], transt[1]; @@ -2426,7 +2436,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || ralpha == 0.) && + null = null || (k <= 0 || ralpha == 0.) 
&& rbeta == 1.; } @@ -2505,7 +2515,7 @@ static integer c_n1 = -1; f_rew(&al__1); } zherk_(uplo, trans, &n, &k, &ralpha, &aa[1], & - lda, &rbeta, &cc[1], &ldc, (ftnlen)1, + lda, &rbeta, &cc[1], &ldc, (ftnlen)1, (ftnlen)1); } else { if (*trace) { @@ -2552,16 +2562,16 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; if (conj) { isame[4] = rals == ralpha; } else { - isame[4] = als.r == alpha.r && als.i == + isame[4] = als.r == alpha.r && als.i == alpha.i; } isame[5] = lze_(&as[1], &aa[1], &laa); @@ -2569,7 +2579,7 @@ static integer c_n1 = -1; if (conj) { isame[7] = rbets == rbeta; } else { - isame[7] = bets.r == beta.r && bets.i == + isame[7] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -2623,19 +2633,19 @@ static integer c_n1 = -1; } if (tran) { zmmch_(transt, "N", &lj, &c__1, &k, & - alpha, &a[jj * a_dim1 + 1], - nmax, &a[j * a_dim1 + 1], - nmax, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { zmmch_("N", transt, &lj, &c__1, &k, & - alpha, &a[jj + a_dim1], nmax, + alpha, &a[jj + a_dim1], nmax, &a[j + a_dim1], nmax, &beta, & c__[jj + j * c_dim1], nmax, & - ct[1], &g[1], &cc[jc], &ldc, + ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & c_true, (ftnlen)1, (ftnlen)1); } @@ -2743,12 +2753,12 @@ static integer c_n1 = -1; } /* zchk4_ */ -/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, +/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, integer *nout, integer *ntra, logical 
*trace, logical *rewi, logical * fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, - doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, + ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, + doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w, ftnlen sname_len) { @@ -2807,30 +2817,30 @@ static integer c_n1 = -1; doublecomplex alpha; doublereal rbeta; logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, logical *, doublecomplex *, ftnlen, ftnlen, ftnlen); integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, logical *, ftnlen, ftnlen); doublereal rbets; logical reset; char trans[1]; logical upper; char uplos[1]; - extern /* Subroutine */ int zher2k_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublereal *, doublecomplex *, integer *, ftnlen, - ftnlen), zsyr2k_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, 
doublecomplex *, integer *, ftnlen, + extern /* Subroutine */ int zher2k_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublereal *, doublecomplex *, integer *, ftnlen, + ftnlen), zsyr2k_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, + extern logical lzeres_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); char transs[1], transt[1]; @@ -2986,7 +2996,7 @@ static integer c_n1 = -1; } null = n <= 0; if (conj) { - null = null || (k <= 0 || alpha.r == 0. && + null = null || (k <= 0 || alpha.r == 0. && alpha.i == 0.) && rbeta == 1.; } @@ -3121,9 +3131,9 @@ static integer c_n1 = -1; /* See what data changed inside subroutines. */ - isame[0] = *(unsigned char *)uplos == *(unsigned + isame[0] = *(unsigned char *)uplos == *(unsigned char *)uplo; - isame[1] = *(unsigned char *)transs == *(unsigned + isame[1] = *(unsigned char *)transs == *(unsigned char *)trans; isame[2] = ns == n; isame[3] = ks == k; @@ -3135,7 +3145,7 @@ static integer c_n1 = -1; if (conj) { isame[9] = rbets == rbeta; } else { - isame[9] = bets.r == beta.r && bets.i == + isame[9] = bets.r == beta.r && bets.i == beta.i; } if (null) { @@ -3191,20 +3201,20 @@ static integer c_n1 = -1; i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = (j - 1 << 1) * *nmax + k + i__; - z__1.r = alpha.r * ab[i__8].r - - alpha.i * ab[i__8].i, + z__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, z__1.i = alpha.r * ab[ i__8].i + alpha.i * ab[ i__8].r; - w[i__7].r = z__1.r, w[i__7].i = + w[i__7].r = z__1.r, w[i__7].i = z__1.i; if (conj) { i__7 = k + i__; d_cnjg(&z__2, &alpha); i__8 = (j - 1 << 1) * *nmax + i__; - z__1.r = z__2.r * ab[i__8].r - 
z__2.i * ab[i__8].i, + z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; @@ -3212,7 +3222,7 @@ static integer c_n1 = -1; i__7 = k + i__; i__8 = (j - 1 << 1) * *nmax + i__; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; } @@ -3223,9 +3233,9 @@ static integer c_n1 = -1; i__8 = *nmax << 1; zmmch_(transt, "N", &lj, &c__1, &i__6, &c_b2, &ab[jjab], &i__7, &w[ - 1], &i__8, &beta, &c__[jj + j + 1], &i__8, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1] - , &cc[jc], &ldc, eps, &err, + , &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } else { @@ -3234,14 +3244,14 @@ static integer c_n1 = -1; if (conj) { i__7 = i__; d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); - z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, - z__1.i = alpha.r * z__2.i + alpha.i * + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * z__2.r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__2.i = alpha.r * ab[i__8].i + alpha.i + .i, z__2.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; d_cnjg(&z__1, &z__2); w[i__7].r = z__1.r, w[i__7].i = z__1.i; @@ -3249,13 +3259,13 @@ static integer c_n1 = -1; i__7 = i__; i__8 = (k + i__ - 1) * *nmax + j; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; i__7 = k + i__; i__8 = (i__ - 1) * *nmax + j; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] - .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; } @@ -3265,9 +3275,9 @@ static integer 
c_n1 = -1; i__7 = *nmax << 1; zmmch_("N", "N", &lj, &c__1, &i__6, & c_b2, &ab[jj], nmax, &w[1], & - i__7, &beta, &c__[jj + j * - c_dim1], nmax, &ct[1], &g[1], - &cc[jc], &ldc, eps, &err, + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, fatal, nout, &c_true, (ftnlen) 1, (ftnlen)1); } @@ -3380,7 +3390,7 @@ static integer c_n1 = -1; } /* zchk5_ */ -/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, +/* Subroutine */ int zchke_(integer *isnum, char *srnamt, integer *nout, ftnlen srnamt_len) { /* Format strings */ @@ -3393,37 +3403,37 @@ static integer c_n1 = -1; integer s_wsfe(cilist *), do_fio(integer *, char *, ftnlen), e_wsfe(void); /* Local variables */ - doublecomplex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] + doublecomplex a[2] /* was [2][1] */, b[2] /* was [2][1] */, c__[2] /* was [2][1] */, beta, alpha; doublereal rbeta; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen, ftnlen), zherk_(char *, char *, integer *, + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen, ftnlen), zhemm_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen, ftnlen), zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, - doublecomplex *, integer *, ftnlen, ftnlen), ztrmm_(char *, char - *, char *, char *, integer *, integer *, doublecomplex *, 
- doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, - ftnlen, ftnlen, ftnlen), zsymm_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + doublecomplex *, integer *, ftnlen, ftnlen), ztrmm_(char *, char + *, char *, char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, ftnlen, + ftnlen, ftnlen, ftnlen), zsymm_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer * - , doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), - zsyrk_(char *, char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, ftnlen, ftnlen), zher2k_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublereal *, doublecomplex *, - integer *, ftnlen, ftnlen), zsyr2k_(char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, + , doublecomplex *, integer *, ftnlen, ftnlen, ftnlen, ftnlen), + zsyrk_(char *, char *, integer *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, ftnlen, ftnlen), zher2k_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, doublecomplex *, + integer *, ftnlen, ftnlen), zsyr2k_(char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, ftnlen, ftnlen); doublereal ralpha; - extern /* Subroutine */ int chkxer_(char *, integer *, integer 
*, logical + extern /* Subroutine */ int chkxer_(char *, integer *, integer *, logical *, logical *, ftnlen); /* Fortran I/O blocks */ @@ -3485,302 +3495,302 @@ static integer c_n1 = -1; } L10: infoc_1.infot = 1; - zgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("/", "N", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - zgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("/", "C", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 1; - zgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("/", "T", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - zgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - zgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 2; - zgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "/", &c__0, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("N", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", 
"N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("T", "C", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c_n1, &c__0, 
&c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 3; - zgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c_n1, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c_n1, &c__0, &alpha, a, 
&c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 4; - zgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c_n1, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, 
&beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 5; - zgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c__0, &c_n1, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, 
(ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__2, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, + zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__2, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 8; - zgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, 
(ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "N", &c__0, &c__0, &c__2, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("C", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("T", "N", &c__0, &c__0, &c__2, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("N", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); 
chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 10; - zgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__0, &c__2, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("N", "N", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("N", "C", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, + zgemm_("N", "T", &c__2, &c__0, &c__0, &alpha, a, &c__2, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); 
chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("C", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "N", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "C", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); infoc_1.infot = 13; - zgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, + zgemm_("T", "T", &c__2, &c__0, &c__0, &alpha, a, &c__1, b, &c__1, &beta, c__, &c__1, (ftnlen)1, (ftnlen)1); chkxer_(srnamt, &infoc_1.infot, nout, &infoc_1.lerr, &infoc_1.ok, (ftnlen) 6); @@ -4960,9 +4970,9 @@ static integer c_n1 = -1; } /* zchke_ */ -/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, - integer *lda, logical *reset, doublecomplex *transl, ftnlen type_len, +/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, + integer *lda, logical *reset, doublecomplex *transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ @@ -5148,10 +5158,10 @@ static integer c_n1 = -1; } /* zmake_ */ /* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, + n, 
integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex * c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex * - cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, + cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, integer *nout, logical *mv, ftnlen transa_len, ftnlen transb_len) { /* Format strings */ @@ -5165,7 +5175,7 @@ static integer c_n1 = -1; " \002,i3)"; /* System generated locals */ - integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublereal d__1, d__2, d__3, d__4, d__5, d__6; doublecomplex z__1, z__2, z__3, z__4; @@ -5224,9 +5234,9 @@ static integer c_n1 = -1; cc -= cc_offset; /* Function Body */ - trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 'C'; - tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 'C'; ctrana = *(unsigned char *)transa == 'C'; ctranb = *(unsigned char *)transb == 'C'; @@ -5254,17 +5264,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; i__7 = k + j * b_dim1; - z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = i__ + k * a_dim1; i__5 = k + j * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag( &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[ - i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * + i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * b_dim1]), 
abs(d__4))); /* L20: */ } @@ -5280,15 +5290,15 @@ static integer c_n1 = -1; i__5 = i__; d_cnjg(&z__3, &a[k + i__ * a_dim1]); i__6 = k + j * b_dim1; - z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6] .r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[k + j * b_dim1]), abs(d__4))); @@ -5308,12 +5318,12 @@ static integer c_n1 = -1; z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = k + j * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[k + j * b_dim1]), abs(d__4))); @@ -5332,15 +5342,15 @@ static integer c_n1 = -1; i__5 = i__; i__6 = i__ + k * a_dim1; d_cnjg(&z__3, &b[j + k * b_dim1]); - z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, - z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * z__3.r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[i__ + k 
* a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[j + k * b_dim1]), abs(d__4))); @@ -5360,12 +5370,12 @@ static integer c_n1 = -1; z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] .i * b[i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = i__ + k * a_dim1; i__5 = j + k * b_dim1; - g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( &b[j + k * b_dim1]), abs(d__4))); @@ -5385,17 +5395,17 @@ static integer c_n1 = -1; i__5 = i__; d_cnjg(&z__3, &a[k + i__ * a_dim1]); d_cnjg(&z__4, &b[j + k * b_dim1]); - z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, - z__2.i = z__3.r * z__4.i + z__3.i * + z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, + z__2.i = z__3.r * z__4.i + z__3.i * z__4.r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L120: */ } @@ -5410,17 +5420,17 @@ static integer c_n1 = -1; i__5 = i__; d_cnjg(&z__3, &a[k + i__ * a_dim1]); i__6 = j + k * b_dim1; - z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, z__2.i = z__3.r * b[i__6].i + z__3.i * b[ i__6].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L140: */ } @@ -5437,17 +5447,17 @@ static integer c_n1 = -1; i__5 = i__; i__6 = k + i__ * a_dim1; d_cnjg(&z__3, &b[j + k * b_dim1]); - z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, - z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * z__3.r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L160: */ } @@ -5463,16 +5473,16 @@ static integer c_n1 = -1; i__6 = k + i__ * a_dim1; i__7 = j + k * b_dim1; z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ - i__7].i, z__2.i = a[i__6].r * b[i__7].i + + i__7].i, z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[i__7].r; - z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + z__2.i; ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; i__4 = k + i__ * a_dim1; i__5 = j + k * b_dim1; g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(&a[k + i__ * a_dim1]), abs(d__2))) - * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(&b[j + k * b_dim1]), abs(d__4))); /* L180: */ } @@ -5485,17 +5495,17 @@ static integer c_n1 = -1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; - z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = + z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; i__5 = i__ + j * c_dim1; - z__3.r = beta->r * c__[i__5].r - 
beta->i * c__[i__5].i, z__3.i = + z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = beta->r * c__[i__5].i + beta->i * c__[i__5].r; z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; ct[i__3].r = z__1.r, ct[i__3].i = z__1.i; i__3 = i__ + j * c_dim1; - g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), + g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + ( - d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, + d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs( d__6))); /* L200: */ @@ -5621,8 +5631,8 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) } /* lze_ */ -logical lzeres_(char *type__, char *uplo, integer *m, integer *n, - doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, +logical lzeres_(char *type__, char *uplo, integer *m, integer *n, + doublecomplex *aa, doublecomplex *as, integer *lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ @@ -5807,7 +5817,7 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, +/* Subroutine */ int chkxer_(char *srnamt, integer *infot, integer *nout, logical *lerr, logical *ok, ftnlen srnamt_len) { /* Format strings */ diff --git a/build/bli_config.h.in b/build/bli_config.h.in index 5208a90f8..41e76d214 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -65,6 +65,13 @@ #endif #endif +#if @enable_hpx@ +#define BLIS_ENABLE_HPX +#if @enable_hpx_as_def@ +#define BLIS_ENABLE_HPX_AS_DEFAULT +#endif +#endif + #if @enable_jrir_slab@ #define BLIS_ENABLE_JRIR_SLAB #endif diff --git a/build/config.mk.in b/build/config.mk.in index efb123366..4624220cf 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -123,6 +123,7 @@ LDFLAGS_PRESET := @ldflags_preset@ # The level of debugging info to generate. 
DEBUG_TYPE := @debug_type@ +ENABLE_DEBUG := @enable_debug@ # Whether to compile and link the AddressSanitizer library. MK_ENABLE_ASAN := @enable_asan@ diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index db20ffbca..4bc91784c 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -557,6 +557,8 @@ bli_info_get_enable_openmp_as_default bli_info_get_enable_pba_pools bli_info_get_enable_pthreads bli_info_get_enable_pthreads_as_default +bli_info_get_enable_hpx +bli_info_get_enable_hpx_as_default bli_info_get_enable_sandbox bli_info_get_enable_sba_pools bli_info_get_enable_threading diff --git a/common.mk b/common.mk index e69b97782..119d09e87 100644 --- a/common.mk +++ b/common.mk @@ -112,6 +112,7 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \ $(call load-var-for,CXXLANGFLAGS,$(1)) \ $(call load-var-for,CPPROCFLAGS,$(1)) \ $(CTHREADFLAGS) \ + $(CXXTHREADFLAGS) \ $(CINCFLAGS) $(VERS_DEF) \ ) @@ -151,6 +152,13 @@ get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(BUILD_SYMFLAGS) \ ) +get-frame-cxxflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ + $(call get-noopt-cxxflags-for,$(1)) \ + $(BUILD_ASANFLAGS) \ + $(BUILD_CPPFLAGS) \ + $(BUILD_SYMFLAGS) \ + ) + get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ $(call load-var-for,CKVECFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ @@ -224,6 +232,7 @@ get-refinit-text-for = "('$(1)' CFLAGS for ref. kernel init)" get-refkern-text-for = "('$(1)' CFLAGS for ref. kernels)" get-config-text-for = "('$(1)' CFLAGS for config code)" get-frame-text-for = "('$(1)' CFLAGS for framework code)" +get-frame-cxxtext-for = "('$(1)' CXXFLAGS for framework code)" get-kernel-text-for = "('$(1)' CFLAGS for kernels)" get-addon-c99text-for = "('$(1)' CFLAGS for addons)" get-addon-cxxtext-for = "('$(1)' CXXFLAGS for addons)" @@ -348,7 +357,11 @@ REFNM := ref # Source suffixes. 
CONFIG_SRC_SUFS := c KERNELS_SRC_SUFS := c s S +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +FRAME_SRC_SUFS := c cpp +else FRAME_SRC_SUFS := c +endif ADDON_C99_SUFS := c ADDON_CXX_SUFS := cc cpp cxx @@ -427,7 +440,6 @@ ADDON_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(ADDON_DIR) SANDBOX_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR) - # # --- Library name and local paths --------------------------------------------- # @@ -687,8 +699,12 @@ endif # --- Linker program --- -# Use whatever compiler was chosen. +# Use whatever compiler was chosen. A C++ compiler must be used if HPX is enabled. +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +LINKER := $(CXX) +else LINKER := $(CC) +endif # --- Warning flags --- @@ -798,14 +814,22 @@ endif CLANGFLAGS := -std=c99 $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CLANGFLAGS,$(c)))) -# Enable C++11. +# Enable C++11, or C++17 if HPX threading is enabled. +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +CXXLANGFLAGS := -std=c++17 +else CXXLANGFLAGS := -std=c++11 +endif $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c)))) # --- C Preprocessor flags --- # Enable clock_gettime() in time.h. CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +# Enable ip_mreq on macOS which is needed for ASIO which is needed for HPX +ifeq ($(OS_NAME),Darwin) +CPPROCFLAGS += -D_DARWIN_C_SOURCE +endif $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c)))) # --- AddressSanitizer flags --- @@ -823,6 +847,7 @@ endif # gets added to begin with. 
CTHREADFLAGS := +CXXTHREADFLAGS := ifeq ($(CC_VENDOR),gcc) #ifneq ($(findstring auto,$(THREADING_MODEL)),) @@ -866,6 +891,18 @@ LDFLAGS += $(LIBPTHREAD) endif endif +# Threading flags for HPX +ifneq ($(findstring hpx,$(THREADING_MODEL)),) +HPX_CXXFLAGS := $(shell pkg-config --cflags hpx_component) +HPX_LDFLAGS := $(filter-out -shared,$(shell pkg-config --libs hpx_component)) +CTHREADFLAGS += $(filter-out -std=%,$(HPX_CXXFLAGS)) +LDFLAGS += $(HPX_LDFLAGS) +ifeq ($(OS_NAME),Darwin) +RPATH_PREFIX := -Wl,-rpath, +LDFLAGS += $(patsubst -L%,$(RPATH_PREFIX)%,$(filter -L%,$(HPX_LDFLAGS))) +endif +endif + # --- #pragma omp simd flags (used for reference kernels only) --- ifeq ($(PRAGMA_OMP_SIMD),yes) diff --git a/configure b/configure index fd4812b1b..f808134d3 100755 --- a/configure +++ b/configure @@ -170,12 +170,12 @@ print_usage() echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" echo " " echo " Enable threading in the library, using threading model(s)" - echo " MODEL={single,openmp,pthreads,auto}. If multiple values" + echo " MODEL={single,openmp,pthreads,hpx,auto}. If multiple values" echo " are specified within MODEL, they will all be compiled into" echo " BLIS, and the choice of which to use will be determined at" echo " runtime. If the user does not express a preference (by" echo " setting the BLIS_THREAD_IMPL environment variable to" - echo " 'single', 'openmp', or 'pthreads'; by calling the global" + echo " 'single', 'openmp', 'pthreads', or 'hpx'; by calling the global" echo " runtime API bli_thread_set_thread_impl(); or by encoding a" echo " choice on a per-call basis within a rntm_t passed into the" echo " expert API), then the first model listed in MODEL will be" @@ -2478,6 +2478,7 @@ main() # The user-given debug type and a flag indicating it was given. debug_type='' debug_flag='' + enable_debug='no' # A flag indicating whether AddressSanitizer should be used. 
enable_asan='no' @@ -3461,8 +3462,10 @@ main() debug_type='noopt' echo "${script_name}: enabling debug symbols; optimizations disabled." fi + enable_debug='yes' else debug_type='off' + enable_debug='no' echo "${script_name}: debug symbols disabled." fi @@ -3526,14 +3529,17 @@ main() enable_single='yes' enable_openmp='no' enable_pthreads='no' + enable_hpx='no' enable_single_01=1 enable_openmp_01=0 enable_pthreads_01=0 + enable_hpx_01=0 parsed_tm='' first_tm='' enable_single_as_def_01=0 enable_openmp_as_def_01=0 enable_pthreads_as_def_01=0 + enable_hpx_as_def_01=0 # Convert whatever reasonable separator the user may have used into a space. threading_model_list=$(echo "${threading_model}" | sed -e "s/[,+]/ /g") @@ -3561,6 +3567,10 @@ main() parsed_tm="${parsed_tm} pthreads" + elif [ "x${word}" = "xhpx" ]; then + + parsed_tm="${parsed_tm} hpx" + elif [ "x${word}" = "xauto" ]; then parsed_tm="${parsed_tm} auto" @@ -3652,7 +3662,15 @@ main() echo "${script_name}: enabling support for threading via pthreads." enable_pthreads='yes' enable_pthreads_01=1 + + elif [ "x${word}" = "xhpx" ]; then + + echo "${script_name}: enabling support for threading via hpx." 
+ enable_hpx='yes' + enable_hpx_01=1 + fi + done # Define boolean variables that can easily be interpreted with #ifdef @@ -3662,25 +3680,37 @@ main() enable_single_as_def_01=1 enable_openmp_as_def_01=0 enable_pthreads_as_def_01=0 + enable_hpx_as_def_01=0 elif [ "x${first_tm}" = "xopenmp" ]; then enable_single_as_def_01=0 enable_openmp_as_def_01=1 enable_pthreads_as_def_01=0 + enable_hpx_as_def_01=0 elif [ "x${first_tm}" = "xpthreads" ]; then enable_single_as_def_01=0 enable_openmp_as_def_01=0 enable_pthreads_as_def_01=1 + enable_hpx_as_def_01=0 + + elif [ "x${first_tm}" = "xhpx" ]; then + + enable_single_as_def_01=0 + enable_openmp_as_def_01=0 + enable_pthreads_as_def_01=0 + enable_hpx_as_def_01=1 + fi # If either OpenMP or pthreads was enabled, given that single-threaded mode is # also always enabled, remind the user which one will serve as the default # (that is, absent any explicit choice at runtime). if [ "x${enable_openmp}" = "xyes" ] || - [ "x${enable_pthreads}" = "xyes" ]; then + [ "x${enable_pthreads}" = "xyes" ] || + [ "x${enable_hpx}" = "xyes" ]; then if [ "x${first_tm}" = "xsingle" ]; then echo "${script_name}: threading will default to single-threaded." @@ -3688,6 +3718,8 @@ main() echo "${script_name}: threading will default to OpenMP." elif [ "x${first_tm}" = "xpthreads" ]; then echo "${script_name}: threading will default to pthreads." + elif [ "x${first_tm}" = "xhpx" ]; then + echo "${script_name}: threading will default to HPX." 
fi fi @@ -4102,6 +4134,7 @@ main() | sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \ | sed -e "s/@enable_asan@/${enable_asan}/g" \ | sed -e "s/@debug_type@/${debug_type}/g" \ + | sed -e "s/@enable_debug@/${enable_debug}/g" \ | sed -e "s/@enable_system@/${enable_system}/g" \ | sed -e "s/@threading_model@/${threading_model}/g" \ | sed -e "s/@prefix@/${prefix_esc}/g" \ @@ -4142,6 +4175,8 @@ main() | sed -e "s/@enable_openmp_as_def@/${enable_openmp_as_def_01}/g" \ | sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \ | sed -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \ + | sed -e "s/@enable_hpx@/${enable_hpx_01}/g" \ + | sed -e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g" \ | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ | sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \ | sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \ diff --git a/docs/FAQ.md b/docs/FAQ.md index 3d0852d36..aee099b37 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -115,7 +115,7 @@ For more information on macrokernels, please read our [ACM TOMS papers](https:// As of 0.2.0, BLIS contains a new infrastructure for communicating runtime information (such as kernel addresses and blocksizes) from the highest levels of code all the way down the function stack, even into the kernels themselves. This new data structure is called a *context* (defined in code as a `cntx_t` type), and together with its API it helped us clean up some hacks and other awkwardness that existed in BLIS prior to 0.2.0. Contexts also lay the groundwork for managing kernels and related kernel information at runtime. -If you are a kernel developer, you can usually ignore the `cntx_t*` argument that is passed into each kernel, since the kernels already inherently "know" this information (such as register blocksizes). 
And if you are a user, and the function you want to call takes a `cntx_t*` argument, you can safely pass in `NULL` and BLIS will automatically build a suitable context for you at runtime. +If you are a kernel developer, you can usually ignore the `cntx_t*` argument that is passed into each kernel, since the kernels already inherently "know" this information (such as register blocksizes). And if you are a user, and the function you want to call takes a `cntx_t*` argument, you can safely pass in `NULL` and BLIS will automatically build a suitable context for you at runtime. ### I'm used to thinking in terms of column-major/row-major storage and leading dimensions. What is a "row stride" / "column stride"? @@ -171,7 +171,7 @@ Originally, BLIS did indeed require the application to explicitly setup (initial ### Does BLIS support multithreading? -Yes! BLIS supports multithreading (via OpenMP or POSIX threads) for all of its level-3 operations. For more information on enabling and controlling multithreading, please see the [Multithreading](Multithreading.md) guide. +Yes! BLIS supports multithreading (via OpenMP, POSIX threads, or HPX) for all of its level-3 operations. For more information on enabling and controlling multithreading, please see the [Multithreading](Multithreading.md) guide. BLIS is also thread-safe so that you can call BLIS from threads within a multithreaded library or application. BLIS derives its thread-safety via unconditional use of features present in POSIX threads (pthreads). These pthreads features are employed for thread-safety regardless of whether BLIS is configured for OpenMP multithreading, pthreads multithreading, or single-threaded execution. 
diff --git a/docs/Multithreading.md b/docs/Multithreading.md index 933296f79..1a46f6556 100644 --- a/docs/Multithreading.md +++ b/docs/Multithreading.md @@ -246,7 +246,7 @@ This will result in both OpenMP and pthreads implementations being compiled and ```c void bli_thread_set_thread_impl( timpl_t ti ); ``` -The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling: +The function takes a `timpl_t`, which is an enumerated type that has four valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling: ```c bli_thread_set_thread_impl( BLIS_POSIX ) ``` @@ -321,7 +321,7 @@ This will result in both OpenMP and pthreads implementations being compiled and ```c void bli_rntm_set_thread_impl( timpl_t ti, rntm_t* rntm ); ``` -The function takes a `timpl_t`, which is an enumerated type that has three valid values corresponding to the three possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling: +The function takes a `timpl_t`, which is an enumerated type that has four valid values corresponding to the four possible threading implementations: `BLIS_OPENMP`, `BLIS_POSIX`, `BLIS_HPX`, and `BLIS_SINGLE`. Forcing use of pthreads is as simple as calling: ```c bli_rntm_set_thread_impl( BLIS_POSIX, &rntm ); ``` @@ -366,7 +366,7 @@ Also, you may pass in `NULL` for the `rntm_t*` parameter of an expert interface. This situation could lead to unexpectedly low multithreaded performance. 
Suppose the user calls `gemm` on a problem with a large m dimension and small k and n dimensions, and explicitly requests parallelism only in the IC loop, but also suppose that the storage of C does not match that of the microkernel's preference. After BLIS transposes the operation internally, the *effective* m dimension will no longer be large; instead, it will be small (because the original m and n dimension will have been swapped). The multithreaded implementation will then proceed to parallelize this small m dimension. There are currently no good *and* easy solutions to this problem. Eventually, though, we plan to add support for two microkernels per datatype per configuration--one for use with matrices C that are row-stored, and one for those that are column-stored. This will obviate the logic within BLIS that sometimes induces the operation transposition, and the problem will go away. - + * **Thread affinity when BLIS and MKL are used together.** Some users have reported that when running a program that links both BLIS (configured with OpenMP) and MKL, **and** when OpenMP thread affinity has been specified (e.g. via `OMP_PROC_BIND` and `OMP_PLACES`), that very poor performance is observed. This may be due to incorrect thread masking, causing all threads to run on one physical core. The exact circumstances leading to this behavior have not been identified, but unsetting the OpenMP thread affinity variables appears to be a solution. # Conclusion diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c index e482d37a1..4160751e6 100644 --- a/frame/3/bli_l3_decor.c +++ b/frame/3/bli_l3_decor.c @@ -224,13 +224,16 @@ void bli_l3_thread_decorator_check #endif #ifndef BLIS_ENABLE_PTHREADS ti == BLIS_POSIX || +#endif +#ifndef BLIS_ENABLE_HPX + ti == BLIS_HPX || #endif FALSE ) { fprintf( stderr, "\n" ); - fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", ( ti == BLIS_OPENMP ? 
"openmp" : "pthreads" ) ); - fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", ( ti == BLIS_OPENMP ? "openmp" : "pthreads" ) ); + fprintf( stderr, "libblis: User requested threading implementation \"%s\", but that method is\n", bli_thread_get_thread_impl_str( ti ) ); + fprintf( stderr, "libblis: unavailable. Try reconfiguring BLIS with \"-t %s\" and recompiling.\n", bli_thread_get_thread_impl_str( ti ) ); fprintf( stderr, "libblis: %s: line %d\n", __FILE__, ( int )__LINE__ ); bli_abort(); } diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 9d6e181d3..1f00537d5 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -104,7 +104,8 @@ gint_t bli_info_get_enable_sba_pools( void ) gint_t bli_info_get_enable_threading( void ) { if ( bli_info_get_enable_openmp() || - bli_info_get_enable_pthreads() ) return 1; + bli_info_get_enable_pthreads() || + bli_info_get_enable_hpx() ) return 1; else return 0; } gint_t bli_info_get_enable_openmp( void ) @@ -123,6 +124,14 @@ gint_t bli_info_get_enable_pthreads( void ) return 0; #endif } +gint_t bli_info_get_enable_hpx( void ) +{ +#ifdef BLIS_ENABLE_HPX + return 1; +#else + return 0; +#endif +} gint_t bli_info_get_enable_openmp_as_default( void ) { #ifdef BLIS_ENABLE_OPENMP_AS_DEFAULT @@ -139,6 +148,14 @@ gint_t bli_info_get_enable_pthreads_as_default( void ) return 0; #endif } +gint_t bli_info_get_enable_hpx_as_default( void ) +{ +#ifdef BLIS_ENABLE_HPX_AS_DEFAULT + return 1; +#else + return 0; +#endif +} gint_t bli_info_get_thread_part_jrir_slab( void ) { #ifdef BLIS_ENABLE_JRIR_SLAB diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index b3514f434..08a99daea 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -70,8 +70,10 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t 
bli_info_get_enable_pthreads( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp_as_default( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads_as_default( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx_as_default( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 542973b18..633d7f671 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -83,12 +83,20 @@ // Default behavior is disabled. #endif +// Enable multithreading via HPX. +#ifdef BLIS_ENABLE_HPX + // No additional definitions needed. +#else + // Default behavior is disabled. +#endif + // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads, or both (as // opposed to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ - defined ( BLIS_ENABLE_PTHREADS ) + defined ( BLIS_ENABLE_PTHREADS ) || \ + defined ( BLIS_ENABLE_HPX ) #define BLIS_ENABLE_MULTITHREADING #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 0c5d11e6b..014be18b7 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -44,9 +44,10 @@ #ifdef __cplusplus // For C++, include stdint.h. - #include + #include #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. + #include #include #include #else @@ -629,6 +630,7 @@ typedef enum BLIS_SINGLE = 0, BLIS_OPENMP, BLIS_POSIX, + BLIS_HPX, // BLIS_NUM_THREAD_IMPLS must be last! 
BLIS_NUM_THREAD_IMPLS diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index f0bba205a..e9f9d9dc7 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -74,16 +74,18 @@ static thrcomm_init_ft init_fpa[ BLIS_NUM_THREAD_IMPLS ] = [BLIS_OPENMP] = #if defined(BLIS_ENABLE_OPENMP) bli_thrcomm_init_openmp, -#elif defined(BLIS_ENABLE_PTHREADS) - NULL, #else NULL, #endif [BLIS_POSIX] = #if defined(BLIS_ENABLE_PTHREADS) bli_thrcomm_init_pthreads, -#elif defined(BLIS_ENABLE_OPENMP) +#else NULL, +#endif + [BLIS_HPX] = +#if defined(BLIS_ENABLE_HPX) + bli_thrcomm_init_hpx, #else NULL, #endif @@ -94,16 +96,18 @@ static thrcomm_cleanup_ft cleanup_fpa[ BLIS_NUM_THREAD_IMPLS ] = [BLIS_OPENMP] = #if defined(BLIS_ENABLE_OPENMP) bli_thrcomm_cleanup_openmp, -#elif defined(BLIS_ENABLE_PTHREADS) - NULL, #else NULL, #endif [BLIS_POSIX] = #if defined(BLIS_ENABLE_PTHREADS) bli_thrcomm_cleanup_pthreads, -#elif defined(BLIS_ENABLE_OPENMP) +#else NULL, +#endif + [BLIS_HPX] = +#if defined(BLIS_ENABLE_HPX) + bli_thrcomm_cleanup_hpx, #else NULL, #endif @@ -114,16 +118,18 @@ static thrcomm_barrier_ft barrier_fpa[ BLIS_NUM_THREAD_IMPLS ] = [BLIS_OPENMP] = #if defined(BLIS_ENABLE_OPENMP) bli_thrcomm_barrier_openmp, -#elif defined(BLIS_ENABLE_PTHREADS) - NULL, #else NULL, #endif [BLIS_POSIX] = #if defined(BLIS_ENABLE_PTHREADS) bli_thrcomm_barrier_pthreads, -#elif defined(BLIS_ENABLE_OPENMP) +#else NULL, +#endif + [BLIS_HPX] = +#if defined(BLIS_ENABLE_HPX) + bli_thrcomm_barrier_hpx, #else NULL, #endif diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 7abd190c7..b65cb0b7a 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -94,6 +94,13 @@ typedef struct thrcomm_s #endif #endif + #ifdef BLIS_ENABLE_HPX + #ifdef BLIS_USE_HPX_BARRIER + hpx::barrier<> * barrier; + #endif + #endif + + } thrcomm_t; @@ -105,6 +112,7 @@ typedef struct thrcomm_s #include "bli_thrcomm_single.h" #include "bli_thrcomm_openmp.h" #include 
"bli_thrcomm_pthreads.h" +#include "bli_thrcomm_hpx.h" // Define a function pointer type for each of the functions that are // "overloaded" by each method of multithreading. diff --git a/frame/thread/bli_thrcomm_hpx.cpp b/frame/thread/bli_thrcomm_hpx.cpp new file mode 100644 index 000000000..d9fb258c2 --- /dev/null +++ b/frame/thread/bli_thrcomm_hpx.cpp @@ -0,0 +1,92 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022 Tactical Computing Laboratories, LLC + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_HPX + +extern "C" { + +#ifdef BLIS_USE_HPX_BARRIER + +// Define the pthread_barrier_t implementations of the init, cleanup, and +// barrier functions. + +void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm ) +{ + if ( comm == nullptr ) return; + comm->barrier = new hpx:barrier<>(); +} + +void bli_thrcomm_cleanup_hpx( thrcomm_t* comm ) +{ + if ( comm == nullptr ) return; + delete comm->barrier; +} + +void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) +{ + comm->barrier->arrive_and_wait(); +} + +#else + +// Define the non-hpx::barrier implementations of the init, cleanup, +// and barrier functions. These are the default unless the hpx::barrier +// versions are requested at compile-time. 
+ +void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm ) +{ + if ( comm == nullptr ) return; + comm->sent_object = nullptr; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; +} + +void bli_thrcomm_cleanup_hpx( thrcomm_t* comm ) +{ +} + +void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm ) +{ + bli_thrcomm_barrier_atomic( t_id, comm ); +} + +} // extern "C" + +#endif + +#endif + diff --git a/frame/thread/bli_thrcomm_hpx.h b/frame/thread/bli_thrcomm_hpx.h new file mode 100644 index 000000000..d80cd2268 --- /dev/null +++ b/frame/thread/bli_thrcomm_hpx.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022 Tactical Computing Laboratories, LLC + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_THRCOMM_HPX_H +#define BLIS_THRCOMM_HPX_H + +// Define these prototypes for situations when HPX multithreading is enabled. +#ifdef BLIS_ENABLE_HPX + +void bli_thrcomm_init_hpx( dim_t nt, thrcomm_t* comm ); +void bli_thrcomm_cleanup_hpx( thrcomm_t* comm ); +void bli_thrcomm_barrier_hpx( dim_t tid, thrcomm_t* comm ); + +#endif + +#endif + diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 8904c88e3..4cba76b20 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -35,6 +35,10 @@ #include "blis.h" +#ifdef BLIS_ENABLE_HPX +#include "bli_thread_hpx.h" +#endif + thrcomm_t BLIS_SINGLE_COMM = {}; // The global rntm_t structure. (The definition resides in bli_rntm.c.) 
@@ -57,16 +61,18 @@ static thread_launch_t thread_launch_fpa[ BLIS_NUM_THREAD_IMPLS ] = [BLIS_OPENMP] = #if defined(BLIS_ENABLE_OPENMP) bli_thread_launch_openmp, -#elif defined(BLIS_ENABLE_PTHREADS) - NULL, #else NULL, #endif [BLIS_POSIX] = #if defined(BLIS_ENABLE_PTHREADS) bli_thread_launch_pthreads, -#elif defined(BLIS_ENABLE_OPENMP) +#else NULL, +#endif + [BLIS_HPX] = +#if defined(BLIS_ENABLE_HPX) + bli_thread_launch_hpx, #else NULL, #endif @@ -1604,6 +1610,7 @@ static const char* bli_timpl_string[BLIS_NUM_THREAD_IMPLS] = [BLIS_SINGLE] = "single", [BLIS_OPENMP] = "openmp", [BLIS_POSIX] = "pthreads", + [BLIS_HPX] = "hpx", }; const char* bli_thread_get_thread_impl_str( timpl_t ti ) @@ -1713,6 +1720,7 @@ void bli_thread_init_rntm_from_env else if ( !strncmp( ti_env, "pthreads", 8 ) ) ti = BLIS_POSIX; else if ( !strncmp( ti_env, "pthread", 7 ) ) ti = BLIS_POSIX; else if ( !strncmp( ti_env, "posix", 5 ) ) ti = BLIS_POSIX; + else if ( !strncmp( ti_env, "hpx", 3 ) ) ti = BLIS_HPX; else ti = BLIS_SINGLE; #ifdef PRINT_IMPL @@ -1732,6 +1740,9 @@ void bli_thread_init_rntm_from_env #ifdef BLIS_ENABLE_PTHREADS_AS_DEFAULT ti = BLIS_POSIX; #endif + #ifdef BLIS_ENABLE_HPX_AS_DEFAULT + ti = BLIS_HPX; + #endif #ifdef PRINT_IMPL printf( "BLIS_THREAD_IMPL unset; defaulting to BLIS_THREAD_IMPL=%s.\n", diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 821e2fe7c..e61fc8b89 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -49,6 +49,7 @@ typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params // Include threading implementations. #include "bli_thread_openmp.h" #include "bli_thread_pthreads.h" +#include "bli_thread_hpx.h" #include "bli_thread_single.h" // Initialization-related prototypes. 
diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp new file mode 100644 index 000000000..38c92481d --- /dev/null +++ b/frame/thread/bli_thread_hpx.cpp @@ -0,0 +1,85 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022 Tactical Computing Laboratories, LLC + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_HPX + +#include +#include +#include + +extern "C" +{ + +void bli_thread_launch_hpx + ( + dim_t n_threads, + thread_func_t func, + const void* params + ) +{ + const timpl_t ti = BLIS_HPX; + + // Allocate a global communicator for the root thrinfo_t structures. + pool_t* gl_comm_pool = nullptr; + thrcomm_t* gl_comm = bli_thrcomm_create( ti, gl_comm_pool, n_threads ); + + auto irange = hpx::util::detail::make_counting_shape(n_threads); + + hpx::for_each(hpx::execution::par, hpx::util::begin(irange), hpx::util::end(irange), + [&gl_comm, &func, &params](const dim_t tid) + { + func( gl_comm, tid, params ); + }); + + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( gl_comm_pool, gl_comm ); +} + +void bli_thread_initialize_hpx( int argc, char** argv ) +{ + hpx::start( nullptr, argc, argv ); +} + +int bli_thread_finalize_hpx() +{ + hpx::apply([]() { hpx::finalize(); }); + return hpx::stop(); +} + +} // extern "C" + +#endif diff --git a/frame/thread/bli_thread_hpx.h b/frame/thread/bli_thread_hpx.h new file mode 100644 index 000000000..55d2758a9 --- /dev/null +++ b/frame/thread/bli_thread_hpx.h @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022 Tactical Computing Laboratories, LLC + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_THREAD_HPX_H +#define BLIS_THREAD_HPX_H + +// Definitions specific to situations when HPX multithreading is enabled. +#ifdef BLIS_ENABLE_HPX + +void bli_thread_launch_hpx + ( + dim_t nt, + thread_func_t func, + const void* params + ); + +void bli_thread_initialize_hpx( int argc, char** argv ); + +int bli_thread_finalize_hpx(); + +#endif + +#endif diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index aec9357ae..7ca314c5f 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -66,6 +66,10 @@ int main( int argc, char** argv ) test_params_t params; test_ops_t ops; +#ifdef BLIS_ENABLE_HPX + bli_thread_initialize_hpx( 1, argv ); +#endif + // Initialize libblis. //bli_init(); @@ -88,8 +92,12 @@ int main( int argc, char** argv ) // Finalize libblis. bli_finalize(); +#ifdef BLIS_ENABLE_HPX + return bli_thread_finalize_hpx(); +#else // Return peacefully. 
return 0; +#endif } @@ -782,26 +790,34 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) const bool has_openmp = bli_info_get_enable_openmp(); const bool has_pthreads = bli_info_get_enable_pthreads(); + const bool has_hpx = bli_info_get_enable_hpx(); const bool openmp_is_def = bli_info_get_enable_openmp_as_default(); const bool pthreads_is_def = bli_info_get_enable_pthreads_as_default(); + const bool hpx_is_def = bli_info_get_enable_hpx_as_default(); const timpl_t ti = bli_thread_get_thread_impl(); // List the available threading implementation(s). - if ( has_openmp && has_pthreads ) sprintf( impl_str, "openmp,pthreads,single" ); - else if ( has_openmp ) sprintf( impl_str, "openmp,single" ); - else if ( has_pthreads ) sprintf( impl_str, "pthreads,single" ); - else sprintf( impl_str, "single only" ); + if ( has_hpx && has_openmp && has_pthreads ) sprintf( impl_str, "openmp,pthreads,hpx,single" ); + else if ( has_hpx && has_openmp ) sprintf( impl_str, "openmp,hpx,single" ); + else if ( has_hpx && has_pthreads ) sprintf( impl_str, "pthreads,hpx,single" ); + else if ( has_hpx ) sprintf( impl_str, "hpx,single" ); + else if ( has_openmp && has_pthreads ) sprintf( impl_str, "openmp,pthreads,single" ); + else if ( has_openmp ) sprintf( impl_str, "openmp,single" ); + else if ( has_pthreads ) sprintf( impl_str, "pthreads,single" ); + else sprintf( impl_str, "single only" ); // Describe the default threading implementation that would be active if // or when BLIS_THREAD_IMPL is unset. if ( openmp_is_def ) sprintf( def_impl_unset_str, "openmp" ); else if ( pthreads_is_def ) sprintf( def_impl_unset_str, "pthreads" ); + else if ( hpx_is_def ) sprintf( def_impl_unset_str, "hpx" ); else sprintf( def_impl_unset_str, "single" ); // Describe the default threading implementation as the testsuite was // currently run. 
if ( ti == BLIS_OPENMP ) sprintf( def_impl_set_str, "openmp" ); else if ( ti == BLIS_POSIX ) sprintf( def_impl_set_str, "pthreads" ); + else if ( ti == BLIS_HPX ) sprintf( def_impl_set_str, "hpx" ); else sprintf( def_impl_set_str, "single" ); // Describe the status of jrir thread partitioning. From f0337b784d164ae505ca0e11277a1155680500d1 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sun, 13 Nov 2022 21:36:47 -0600 Subject: [PATCH 109/230] Trivial whitespace/comment tweaks. Details: - Trivial whitespace and comment changes, most of which ideally would have been part of the previous commit pertaining to HPX (2b05948). --- common.mk | 9 ++++- configure | 56 +++++++++++++-------------- frame/include/bli_config_macro_defs.h | 2 +- frame/thread/bli_thrcomm.h | 3 +- frame/thread/bli_thread_hpx.cpp | 6 +-- testsuite/src/test_libblis.c | 4 +- 6 files changed, 43 insertions(+), 37 deletions(-) diff --git a/common.mk b/common.mk index 119d09e87..6b7403afb 100644 --- a/common.mk +++ b/common.mk @@ -256,6 +256,7 @@ files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f) rm-dups = $(if $1,$(firstword $1) $(call rm-dups,$(filter-out $(firstword $1),$1))) + # # --- Include makefile configuration file -------------------------------------- # @@ -440,6 +441,7 @@ ADDON_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(ADDON_DIR) SANDBOX_FRAG_PATH := ./obj/$(CONFIG_NAME)/$(SANDBOX_DIR) + # # --- Library name and local paths --------------------------------------------- # @@ -515,6 +517,8 @@ else LIBBLIS_SO_OUTPUT_NAME := $(LIBBLIS_SO_PATH) endif + + # # --- Utility program definitions ---------------------------------------------- # @@ -644,6 +648,7 @@ endif endif + # # --- Include makefile definitions file ---------------------------------------- # @@ -826,7 +831,7 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c)) # Enable clock_gettime() in time.h.
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -# Enable ip_mreq on macOS which is needed for ASIO which is needed for HPX +# Enable ip_mreq on macOS which is needed for ASIO which is needed for HPX. ifeq ($(OS_NAME),Darwin) CPPROCFLAGS += -D_DARWIN_C_SOURCE endif @@ -891,7 +896,7 @@ LDFLAGS += $(LIBPTHREAD) endif endif -# Threading flags for HPX +# Threading flags for HPX. ifneq ($(findstring hpx,$(THREADING_MODEL)),) HPX_CXXFLAGS := $(shell pkg-config --cflags hpx_component) HPX_LDFLAGS := $(filter-out -shared,$(shell pkg-config --libs hpx_component)) diff --git a/configure b/configure index f808134d3..286a66123 100755 --- a/configure +++ b/configure @@ -147,8 +147,8 @@ print_usage() echo " --enable-rpath, --disable-rpath" echo " " echo " Enable (disabled by default) setting an install_name for" - echo " dynamic libraries on macOS which starts with @rpath rather" - echo " than the absolute install path." + echo " dynamic libraries on macOS which starts with @rpath rather" + echo " than the absolute install path." echo " " echo " -e SYMBOLS, --export-shared[=SYMBOLS]" echo " " @@ -175,17 +175,17 @@ print_usage() echo " BLIS, and the choice of which to use will be determined at" echo " runtime. If the user does not express a preference (by" echo " setting the BLIS_THREAD_IMPL environment variable to" - echo " 'single', 'openmp', 'pthreads', or 'hpx'; by calling the global" - echo " runtime API bli_thread_set_thread_impl(); or by encoding a" - echo " choice on a per-call basis within a rntm_t passed into the" - echo " expert API), then the first model listed in MODEL will be" - echo " used by default. Note that 'single' is silently appended" - echo " to whatever the user specifies in MODEL, meaning that" - echo " single-threaded functionality will always be available," - echo " even if it is not requested and even if it is not enabled" - echo " by default. 
Even --disable-threading is actually shorthand" - echo " for --enable-threading=single (which is the default when" - echo " the option is not specified)." + echo " 'single', 'openmp', 'pthreads', or 'hpx'; by calling the" + echo " global runtime API bli_thread_set_thread_impl(); or by" + echo " encoding a choice on a per-call basis within a rntm_t" + echo " passed into the expert API), then the first model listed" + echo " in MODEL will be used by default. Note that 'single' is" + echo " silently appended to whatever the user specifies in MODEL," + echo " meaning that single-threaded functionality will always be" + echo " available, even if it is not requested and even if it is" + echo " not enabled by default. Even --disable-threading is" + echo " actually shorthand for --enable-threading=single (which is" + echo " the default when the option is not specified)." echo " " echo " --enable-system, --disable-system" echo " " @@ -1262,9 +1262,9 @@ has_libmemkind() # Depending on the return code from the compile step above, we set # enable_memkind accordingly. if [ "$?" == 0 ]; then - rval='yes' + rval='yes' else - rval='no' + rval='no' fi # Remove the executable generated above. @@ -1292,9 +1292,9 @@ has_pragma_omp_simd() # Depending on the return code from the compile step above, we set # enable_memkind accordingly. if [ "$?" == 0 ]; then - rval='yes' + rval='yes' else - rval='no' + rval='no' fi # Remove the executable generated above. @@ -1514,11 +1514,11 @@ get_compiler_version() # Begin parsing cc_vendor for the version string. if [ "${cc_vendor}" = "GCC" ]; then - # Conda gcc sometimes has GCC (all caps) in the version string + # Conda gcc sometimes has GCC (all caps) in the version string cc_vendor="gcc" fi if [ "${cc_vendor}" = "crosstool-NG" ]; then - # Treat compilers built by crosstool-NG (for eg: conda) as gcc. + # Treat compilers built by crosstool-NG (for eg: conda) as gcc. 
cc_vendor="gcc" fi if [ "${cc_vendor}" = "icc" -o \ @@ -1561,7 +1561,7 @@ get_compiler_version() cc_version=$(echo "${vendor_string}" \ | egrep -o 'AOCC.LLVM.[0-9]+\.[0-9]+\.?[0-9]*' \ | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' \ - | { read first rest ; echo $first ; }) + | { read first rest ; echo $first ; }) else # Grep for the AOCC_x.y.z substring first, and then isolate the @@ -1572,7 +1572,7 @@ get_compiler_version() cc_version=$(echo "${vendor_string}" \ | egrep -o 'AOCC_[0-9]+\.[0-9]+\.?[0-9]*' \ | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' \ - | { read first rest ; echo $first ; }) + | { read first rest ; echo $first ; }) fi elif [ "${cc_vendor}" = "oneAPI" ]; then @@ -2025,9 +2025,9 @@ try_assemble() ${cc} ${cflags} -c ${asm_src} -o ${asm_bin} > /dev/null 2>&1 if [ "$?" == 0 ]; then - rval='yes' + rval='yes' else - rval='no' + rval='no' fi # Remove the object file. @@ -2501,7 +2501,7 @@ main() enable_arg_max_hack='no' enable_static='yes' enable_shared='yes' - enable_rpath='no' + enable_rpath='no' export_shared='public' enable_pba_pools='yes' enable_sba_pools='yes' @@ -2944,7 +2944,7 @@ main() get_binutils_version check_assembler - # Check if there is any incompatibility due to the operating system. + # Check if there is any incompatibility due to the operating system. check_os # Remove duplicates and whitespace from the blacklist. @@ -3473,7 +3473,7 @@ main() if [ "x${enable_asan}" = "xyes" ]; then echo "${script_name}: enabling AddressSanitizer support (except for optimized kernels)." else - enable_asan='no' + enable_asan='no' echo "${script_name}: AddressSanitizer support disabled." fi @@ -3665,7 +3665,7 @@ main() elif [ "x${word}" = "xhpx" ]; then - echo "${script_name}: enabling support for threading via hpx." + echo "${script_name}: enabling support for threading via HPX." 
enable_hpx='yes' enable_hpx_01=1 @@ -3705,7 +3705,7 @@ main() fi - # If either OpenMP or pthreads was enabled, given that single-threaded mode is + # If OpenMP, pthreads, or HPX was enabled, given that single-threaded mode is # also always enabled, remind the user which one will serve as the default # (that is, absent any explicit choice at runtime). if [ "x${enable_openmp}" = "xyes" ] || diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 633d7f671..bf9319f4f 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -94,7 +94,7 @@ // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads, or both (as // opposed to neither being used). -#if defined ( BLIS_ENABLE_OPENMP ) || \ +#if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) || \ defined ( BLIS_ENABLE_HPX ) #define BLIS_ENABLE_MULTITHREADING diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index b65cb0b7a..b55922acd 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -94,13 +94,14 @@ typedef struct thrcomm_s #endif #endif + // -- Fields specific to HPX -- + #ifdef BLIS_ENABLE_HPX #ifdef BLIS_USE_HPX_BARRIER hpx::barrier<> * barrier; #endif #endif - } thrcomm_t; diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp index 38c92481d..a7818ffd6 100644 --- a/frame/thread/bli_thread_hpx.cpp +++ b/frame/thread/bli_thread_hpx.cpp @@ -71,13 +71,13 @@ void bli_thread_launch_hpx void bli_thread_initialize_hpx( int argc, char** argv ) { - hpx::start( nullptr, argc, argv ); + hpx::start( nullptr, argc, argv ); } int bli_thread_finalize_hpx() { - hpx::apply([]() { hpx::finalize(); }); - return hpx::stop(); + hpx::apply([]() { hpx::finalize(); }); + return hpx::stop(); } } // extern "C" diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 7ca314c5f..851102a2f 100644 
--- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -67,7 +67,7 @@ int main( int argc, char** argv ) test_ops_t ops; #ifdef BLIS_ENABLE_HPX - bli_thread_initialize_hpx( 1, argv ); + bli_thread_initialize_hpx( 1, argv ); #endif // Initialize libblis. @@ -93,7 +93,7 @@ int main( int argc, char** argv ) bli_finalize(); #ifdef BLIS_ENABLE_HPX - return bli_thread_finalize_hpx(); + return bli_thread_finalize_hpx(); #else // Return peacefully. return 0; From db10dd8e11a12d85017f84455558a82c0093b1da Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 29 Nov 2022 19:10:31 -0600 Subject: [PATCH 110/230] Fixed _gemm_small() prototype; disabled gemm_small. Details: - Fixed a mismatch between the prototype for bli_gemm_small() in bli_gemm_front.h and the actual definition of bli_gemm_small() in kernels/zen/3/bli_gemm_small.c. The former was erroneously declaring the cntl_t* argument as 'const'. Thanks to Jeff Diamond for reporting this issue. - Commented out BLIS_ENABLE_SMALL_MATRIX, BLIS_ENABLE_SMALL_MATRIX_TRSM macro definitions in config/zen3/bli_family_zen3.h. AMD's small matrix implementation should probably remain disabled in vanilla BLIS, at least for now. --- config/zen3/bli_family_zen3.h | 4 ++-- frame/3/gemm/bli_gemm_front.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/zen3/bli_family_zen3.h b/config/zen3/bli_family_zen3.h index 661313ca9..d03e2edc7 100644 --- a/config/zen3/bli_family_zen3.h +++ b/config/zen3/bli_family_zen3.h @@ -52,8 +52,8 @@ // All zen3 specific code should be included in this macro #define BLIS_CONFIG_ZEN3 -#define BLIS_ENABLE_SMALL_MATRIX -#define BLIS_ENABLE_SMALL_MATRIX_TRSM +//#define BLIS_ENABLE_SMALL_MATRIX +//#define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. 
diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 9465c37d9..3acf29cfb 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -52,7 +52,7 @@ err_t bli_gemm_small const obj_t* beta, const obj_t* c, const cntx_t* cntx, - const cntl_t* cntl + cntl_t* cntl ); #endif From 4833ba224eba54df3f349bcb7e188bcc53442449 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 12 Dec 2022 20:26:02 -0600 Subject: [PATCH 111/230] Fixed perf of mt sup with packing, and mt gemmlike. (#696) Details: - Brought the gemmsup code path up to date relative to the latest thrinfo_t semantics introduced in the October Omnibus commit (aeb5f0c). This was done by passing the prenode (instead of the current node) into the packm variant within bli_l3_sup_packm.c as well as creating the prenodes and attaching them to the thrinfo_t tree in bli_l3_sup_thrinfo_create(). These changes erase the performance degradation introduced in the omnibus when running multithreaded sup with optional packing enabled. Special thanks to Devin Matthews for sussing out this fix in short order. - Fixed the gemmlike sandbox in a manner similar to that of sup with packing, described above. This also involved passing the prenode into the local gemmlike packm variant. (Recall that gemmlike recycles the use of bli_l3_sup_thrinfo_create(), so it automatically inherits that part of the sup fix described above.) - Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and bli_thrinfo_thread_id(), respectively. 
--- frame/3/bli_l3_sup_packm.c | 4 ++-- frame/3/bli_l3_thrinfo.c | 9 +++++++++ sandbox/gemmlike/bls_l3_packm_a.c | 2 +- sandbox/gemmlike/bls_l3_packm_b.c | 2 +- sandbox/gemmlike/bls_l3_packm_var1.c | 4 ++-- sandbox/gemmlike/bls_l3_packm_var2.c | 4 ++-- sandbox/gemmlike/bls_l3_packm_var3.c | 4 ++-- 7 files changed, 19 insertions(+), 10 deletions(-) diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c index 797335aeb..890980da3 100644 --- a/frame/3/bli_l3_sup_packm.c +++ b/frame/3/bli_l3_sup_packm.c @@ -394,7 +394,7 @@ void bli_packm_sup ( void* )a, rs_a, cs_a, *p, *rs_p, *cs_p, ( cntx_t* )cntx, - thread + bli_thrinfo_sub_prenode( thread ) ); } else // if ( schema == BLIS_PACKED_ROW_PANELS ) @@ -415,7 +415,7 @@ void bli_packm_sup *p, *rs_p, *cs_p, pd_p, *ps_p, ( cntx_t* )cntx, - thread + bli_thrinfo_sub_prenode( thread ) ); } diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 0b45abbf6..95d2a5439 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -150,6 +150,15 @@ thrinfo_t* bli_l3_sup_thrinfo_create thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa ); thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr ); + const dim_t n_way_pb = bli_thrinfo_num_threads( thread_pb ); + const dim_t n_way_pa = bli_thrinfo_num_threads( thread_pa ); + + // Create and set the prenodes for the packb and packa thrinfo_t nodes. 
+ thrinfo_t* thread_pb_single = bli_thrinfo_split( n_way_pb, thread_pb ); + thrinfo_t* thread_pa_single = bli_thrinfo_split( n_way_pa, thread_pa ); + bli_thrinfo_set_sub_prenode( thread_pb_single, thread_pb ); + bli_thrinfo_set_sub_prenode( thread_pa_single, thread_pa ); + bli_thrinfo_set_sub_node( thread_jc, root ); bli_thrinfo_set_sub_node( thread_pc, thread_jc ); bli_thrinfo_set_sub_node( thread_pb, thread_pc ); diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index 412c6c24e..742c78bfb 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -276,7 +276,7 @@ void PASTECH2(bls_,ch,opname) \ *p, *rs_p, *cs_p, \ pd_p, *ps_p, \ cntx, \ - thread \ + bli_thrinfo_sub_prenode( thread ) \ ); \ \ /* Barrier so that packing is done before computation. */ \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index cc9757b1d..db6bca8fc 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -276,7 +276,7 @@ void PASTECH2(bls_,ch,opname) \ *p, *rs_p, *cs_p, \ pd_p, *ps_p, \ cntx, \ - thread \ + bli_thrinfo_sub_prenode( thread ) \ ); \ \ /* Barrier so that packing is done before computation. */ \ diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c index e4d566b44..7c2c4e9a9 100644 --- a/sandbox/gemmlike/bls_l3_packm_var1.c +++ b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thrinfo_num_threads( thread ); \ - const dim_t tid = bli_thrinfo_thread_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). 
*/ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index 3e7e7888a..94ee0efcd 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thrinfo_num_threads( thread ); \ - const dim_t tid = bli_thrinfo_thread_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c index 4ccb1828d..48cd6dd60 100644 --- a/sandbox/gemmlike/bls_l3_packm_var3.c +++ b/sandbox/gemmlike/bls_l3_packm_var3.c @@ -121,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thrinfo_num_threads( thread ); \ - const dim_t tid = bli_thrinfo_thread_id( thread ); \ + const dim_t nt = bli_thrinfo_n_way( thread ); \ + const dim_t tid = bli_thrinfo_work_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ From 3accacf57d11e9b109339754f91bf22329b6cb6a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 16 Dec 2022 10:26:33 -0600 Subject: [PATCH 112/230] Skip 1m optimization when forcing hemm_l/symm_l. (#697) Details: - Fixed a bug in right-sided hemm when: - using the 1m method, - #defining BLIS_DISABLE_HEMM_RIGHT in the active subconfiguration, and - the storage of C matches the gemm microkernel IO preference PRIOR to the right-sidedness being detected and recast in terms of the left- side code path. 
It turns out that bli_gemm_ind_recast_1m_params() was applying its optimization (recasting a complex-domain macrokernel calling a 1m virtual microkernel to a real-domain macrokernel calling the real- domain microkernel) in situations in which it should not have. The optimization was silently assuming that the storage of C always matched that of the microkernel preference, since the front-end (in this case, bli_hemm_front()) would have already had a chance to transpose the operation to bring the two into agreement. However, by disabling right-sided hemm, we deprive BLIS of that flexibility (as a transposed left-sided case would necessarily have to become a right- sided case), and thus the assumption was no longer holding in all cases. Thanks to Nisanth M P for reporting this bug in Issue #621. - The aforementioned bug, and its bugfix, also apply to symm when BLIS_DISABLE_SYMM_RIGHT is defined. - Comment updates. - CREDITS file update. --- CREDITS | 1 + frame/3/gemm/bli_gemm_ker_var2.c | 12 +++--- frame/3/gemm/ind/bli_gemm_ind_opt.h | 64 +++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 23 deletions(-) diff --git a/CREDITS b/CREDITS index 55c974f1b..939351c00 100644 --- a/CREDITS +++ b/CREDITS @@ -74,6 +74,7 @@ but many others have contributed code and feedback, including @nagsingh Bhaskar Nallani @BhaskarNallani (AMD) Stepan Nassyr @stepannassyr (Jülich Supercomputing Centre) + Nisanth M P @nisanthmp Nisanth Padinharepatt (AMD) Ajay Panyala @ajaypanyala Marc-Antoine Parent @maparent (Conversence) diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 51dceced2..d59695081 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -131,13 +131,10 @@ void bli_gemm_ker_var2 const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); const char* beta_cast = bli_obj_internal_scalar_buffer( c ); - // If 1m is being employed on a column- or row-stored matrix with a - // real-valued beta, 
we can use the real domain macro-kernel, which - // eliminates a little overhead associated with the 1m virtual - // micro-kernel. - // Only employ this optimization if the storage datatype of C is - // equal to the execution/computation datatype. #if 1 + // Under certain conditions, we can avoid the overhead of calling the 1m + // virtual microkernel by having the real-domain macrokernel execute with + // the real-domain microkernel. (See the function definition for details.) if ( bli_cntx_method( cntx ) == BLIS_1M ) { bli_gemm_ind_recast_1m_params @@ -149,7 +146,8 @@ void bli_gemm_ker_var2 &m, &n, &k, &pd_a, &ps_a, &pd_b, &ps_b, - &rs_c, &cs_c + &rs_c, &cs_c, + cntx ); } #endif diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h index 789d5895c..a57325580 100644 --- a/frame/3/gemm/ind/bli_gemm_ind_opt.h +++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h @@ -34,27 +34,59 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params ( - num_t* dt_exec, - num_t* dt_c, - pack_t schema_a, - const obj_t* c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - inc_t* rs_c, inc_t* cs_c + num_t* dt_exec, + num_t* dt_c, + pack_t schema_a, + const obj_t* c, + dim_t* m, + dim_t* n, + dim_t* k, + inc_t* pd_a, inc_t* ps_a, + inc_t* pd_b, inc_t* ps_b, + inc_t* rs_c, inc_t* cs_c, + const cntx_t* cntx ) { obj_t beta; - /* Detach the beta scalar from c so that we can test its imaginary - component. */ + // Detach the beta scalar from c so that we can test its imaginary + // component. bli_obj_scalar_detach( c, &beta ); - /* If beta is in the real domain, and c is row- or column-stored, - then we may proceed with the optimization. */ - if ( bli_obj_imag_is_zero( &beta ) && +#if 1 + // Determine whether the storage of C matches the IO preference of the + // microkernel. (We cannot utilize the optimization below if there is a + // mismatch.) 
+ const ukr_t ukr_id = BLIS_GEMM_VIR_UKR; + + const bool row_stored = bli_is_row_stored( *rs_c, *cs_c ); + const bool col_stored = !row_stored; + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( *dt_c, ukr_id, cntx ); + const bool col_pref = !row_pref; + + const bool is_match = ( row_stored && row_pref ) || + ( col_stored && col_pref ); +#else + // This was the previous behavior, which resulted in buggy behavior + // when executing right-sided hemm, and: + // - the 1m method is enabled, + // - BLIS_DISABLE_HEMM_RIGHT is #defined, and + // - the storage of C matches the microkernel IO preference PRIOR to + // detecting the right-sidedness of the operation. + // See Issue #621 for details. + const bool is_match = TRUE; +#endif + + // If (a) the storage of C matches the IO pref of the ukernel, (b) beta is + // in the real domain, and (c) C is row- or column-stored, then we may + // proceed with the optimization below, which allows 1m to be induced by + // executing the real-domain macrokernel with the real-domain microkernel + // plus a few tweaked parameters. Otherwise, we must skip the optimization + // and allow 1m to execute via the complex-domain macrokernel calling the + // 1m virtual microkernel function, which will incur a little extra + // overhead. + if ( is_match && + bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); @@ -69,7 +101,7 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } - else /* if ( bli_is_1r_packed( schema_a ) ) */ + else // if ( bli_is_1r_packed( schema_a ) ) { *m *= 1; *n *= 2; From 7d23dc2a064a371dc9883e2c2c7236a70912428c Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 25 Dec 2022 19:09:14 -0600 Subject: [PATCH 113/230] Fix a race condition which manifested as incorrect results (rarely). 
(#702) The problem occurs when there are at least two teams of threads packing different parts of a matrix, and where each team has at least two threads; call them team A and team B. The problematic sequence is: 1. The chief of team A checks out a block B and broadcasts the pointer to its teammates. 2. Team A completely packs their data and performs a barrier amongst themselves. 3. Team A commences computing with the packed data. 4. The chief of team A finishes computing before its teammates, then calls bli_thrinfo_free on its thrinfo_t struct (which contains the mem_t object referencing the buffer B). This causes buffer B to be checked back in to the pba. 5. The chief of team B checks out the *same* block B that was just checked back in and broadcasts the pointer to its teammates. 6. DATA RACE: now the remaining threads of team A are reading *while* team B are writing to the same buffer B. If team B write new data before team A are done computing then an incorrect result is generated. The solution is to place a global barrier before the call to bli_thrinfo_free at the end of the computation. Co-authored-by: Field G. Van Zee --- frame/3/bli_l3_decor.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c index 4160751e6..88ec5def9 100644 --- a/frame/3/bli_l3_decor.c +++ b/frame/3/bli_l3_decor.c @@ -114,6 +114,11 @@ static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const bli_l3_cntl_free( sba_pool, cntl_use ); // Free the current thread's thrinfo_t structure. + // NOTE: The barrier here is very important as it prevents memory being + // released by the chief of some thread sub-group before its peers are done + // using it. See PR #702 for more info [1]. + // [1] https://github.com/flame/blis/pull/702 + bli_thrinfo_barrier( thread ); bli_thrinfo_free( thread ); } From 538150c5845ad903773ca797c740048174116aa4 Mon Sep 17 00:00:00 2001 From: "Field G.
Van Zee" Date: Sun, 25 Dec 2022 22:28:09 -0600 Subject: [PATCH 114/230] Applied race condition fix to sup thread decorator. Details: - Applied the race condition bugfix in commit 7d23dc2 to the corresponding sup code in bli_l3_sup_decor.c. Note that in the case of sup, the race condition would have only manifested when optional packing was enabled at runtime (typically via setting BLIS_PACK_A and/or BLIS_PACK_B environment variables). - Both the fix in this commit and the fix in 7d23dc2 address bugs that were introduced when the thrinfo_t trees/communicators were restructured in the October omnibus commit (aeb5f0c). --- frame/3/bli_l3_sup_decor.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/frame/3/bli_l3_sup_decor.c b/frame/3/bli_l3_sup_decor.c index 5f415ac50..7cda8bdca 100644 --- a/frame/3/bli_l3_sup_decor.c +++ b/frame/3/bli_l3_sup_decor.c @@ -85,6 +85,11 @@ static void bli_l3_sup_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, co ); // Free the current thread's thrinfo_t structure. + // NOTE: The barrier here is very important as it prevents memory being + // released by the chief of some thread sub-group before its peers are done + // using it. See PR #702 for more info [1]. + // [1] https://github.com/flame/blis/pull/702 + bli_thrinfo_barrier( thread ); bli_thrinfo_free( thread ); } From f956b79922da412791e4c8b8b846b3aafc0a5ee0 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 31 Dec 2022 20:18:08 -0600 Subject: [PATCH 115/230] Switch to l3 sup decorator in gemmlike sandbox. (#704) Details: - Modified the gemmlike sandbox to call bli_l3_sup_thread_decorator() rather than a local analogue of that code. This reduces redundant logic and makes it easier for the sandbox to inherit future improvements to the framework's threading code. - Moved addon/gemmd to addon/old/gemmd. This code has fallen out of date and is taking too much effort to maintain. 
We will very likely reimplement it completely once future changes are made to the framework proper. --- .../{ => old}/gemmd/attic/bao_gemmd_bp_var2.c | 0 addon/{ => old}/gemmd/attic/bli_gemm_ex.c | 0 addon/{ => old}/gemmd/bao_gemmd.c | 0 addon/{ => old}/gemmd/bao_gemmd.h | 0 addon/{ => old}/gemmd/bao_gemmd_bp_var1.c | 0 addon/{ => old}/gemmd/bao_gemmd_check.c | 0 addon/{ => old}/gemmd/bao_gemmd_check.h | 0 addon/{ => old}/gemmd/bao_gemmd_var.h | 0 addon/{ => old}/gemmd/bao_l3_packm_a.c | 0 addon/{ => old}/gemmd/bao_l3_packm_a.h | 0 addon/{ => old}/gemmd/bao_l3_packm_b.c | 0 addon/{ => old}/gemmd/bao_l3_packm_b.h | 0 addon/{ => old}/gemmd/bao_l3_packm_var.h | 0 addon/{ => old}/gemmd/bao_l3_packm_var1.c | 0 addon/{ => old}/gemmd/bao_l3_packm_var2.c | 0 addon/{ => old}/gemmd/bao_packm_cxk.c | 0 addon/{ => old}/gemmd/bao_packm_cxk.h | 0 addon/{ => old}/gemmd/gemmd.h | 0 addon/{ => old}/gemmd/thread/bao_l3_decor.c | 0 addon/{ => old}/gemmd/thread/bao_l3_decor.h | 0 .../gemmd/thread/bao_l3_decor_openmp.c | 0 .../gemmd/thread/bao_l3_decor_openmp.h | 0 .../gemmd/thread/bao_l3_decor_pthreads.c | 0 .../gemmd/thread/bao_l3_decor_pthreads.h | 0 .../gemmd/thread/bao_l3_decor_single.c | 0 .../gemmd/thread/bao_l3_decor_single.h | 0 sandbox/gemmlike/bli_sandbox.h | 2 -- sandbox/gemmlike/bls_gemm.c | 35 ++++++++++--------- sandbox/gemmlike/bls_gemm.h | 18 +++++----- sandbox/gemmlike/bls_gemm_bp_var1.c | 20 +++++------ sandbox/gemmlike/bls_gemm_var.h | 16 ++++----- sandbox/gemmlike/{ => old}/bls_l3_decor.c | 0 sandbox/gemmlike/{ => old}/bls_l3_decor.h | 0 33 files changed, 42 insertions(+), 49 deletions(-) rename addon/{ => old}/gemmd/attic/bao_gemmd_bp_var2.c (100%) rename addon/{ => old}/gemmd/attic/bli_gemm_ex.c (100%) rename addon/{ => old}/gemmd/bao_gemmd.c (100%) rename addon/{ => old}/gemmd/bao_gemmd.h (100%) rename addon/{ => old}/gemmd/bao_gemmd_bp_var1.c (100%) rename addon/{ => old}/gemmd/bao_gemmd_check.c (100%) rename addon/{ => old}/gemmd/bao_gemmd_check.h (100%) 
rename addon/{ => old}/gemmd/bao_gemmd_var.h (100%) rename addon/{ => old}/gemmd/bao_l3_packm_a.c (100%) rename addon/{ => old}/gemmd/bao_l3_packm_a.h (100%) rename addon/{ => old}/gemmd/bao_l3_packm_b.c (100%) rename addon/{ => old}/gemmd/bao_l3_packm_b.h (100%) rename addon/{ => old}/gemmd/bao_l3_packm_var.h (100%) rename addon/{ => old}/gemmd/bao_l3_packm_var1.c (100%) rename addon/{ => old}/gemmd/bao_l3_packm_var2.c (100%) rename addon/{ => old}/gemmd/bao_packm_cxk.c (100%) rename addon/{ => old}/gemmd/bao_packm_cxk.h (100%) rename addon/{ => old}/gemmd/gemmd.h (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor.c (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor.h (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor_openmp.c (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor_openmp.h (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor_pthreads.c (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor_pthreads.h (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor_single.c (100%) rename addon/{ => old}/gemmd/thread/bao_l3_decor_single.h (100%) rename sandbox/gemmlike/{ => old}/bls_l3_decor.c (100%) rename sandbox/gemmlike/{ => old}/bls_l3_decor.h (100%) diff --git a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/old/gemmd/attic/bao_gemmd_bp_var2.c similarity index 100% rename from addon/gemmd/attic/bao_gemmd_bp_var2.c rename to addon/old/gemmd/attic/bao_gemmd_bp_var2.c diff --git a/addon/gemmd/attic/bli_gemm_ex.c b/addon/old/gemmd/attic/bli_gemm_ex.c similarity index 100% rename from addon/gemmd/attic/bli_gemm_ex.c rename to addon/old/gemmd/attic/bli_gemm_ex.c diff --git a/addon/gemmd/bao_gemmd.c b/addon/old/gemmd/bao_gemmd.c similarity index 100% rename from addon/gemmd/bao_gemmd.c rename to addon/old/gemmd/bao_gemmd.c diff --git a/addon/gemmd/bao_gemmd.h b/addon/old/gemmd/bao_gemmd.h similarity index 100% rename from addon/gemmd/bao_gemmd.h rename to addon/old/gemmd/bao_gemmd.h diff --git a/addon/gemmd/bao_gemmd_bp_var1.c 
b/addon/old/gemmd/bao_gemmd_bp_var1.c similarity index 100% rename from addon/gemmd/bao_gemmd_bp_var1.c rename to addon/old/gemmd/bao_gemmd_bp_var1.c diff --git a/addon/gemmd/bao_gemmd_check.c b/addon/old/gemmd/bao_gemmd_check.c similarity index 100% rename from addon/gemmd/bao_gemmd_check.c rename to addon/old/gemmd/bao_gemmd_check.c diff --git a/addon/gemmd/bao_gemmd_check.h b/addon/old/gemmd/bao_gemmd_check.h similarity index 100% rename from addon/gemmd/bao_gemmd_check.h rename to addon/old/gemmd/bao_gemmd_check.h diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/old/gemmd/bao_gemmd_var.h similarity index 100% rename from addon/gemmd/bao_gemmd_var.h rename to addon/old/gemmd/bao_gemmd_var.h diff --git a/addon/gemmd/bao_l3_packm_a.c b/addon/old/gemmd/bao_l3_packm_a.c similarity index 100% rename from addon/gemmd/bao_l3_packm_a.c rename to addon/old/gemmd/bao_l3_packm_a.c diff --git a/addon/gemmd/bao_l3_packm_a.h b/addon/old/gemmd/bao_l3_packm_a.h similarity index 100% rename from addon/gemmd/bao_l3_packm_a.h rename to addon/old/gemmd/bao_l3_packm_a.h diff --git a/addon/gemmd/bao_l3_packm_b.c b/addon/old/gemmd/bao_l3_packm_b.c similarity index 100% rename from addon/gemmd/bao_l3_packm_b.c rename to addon/old/gemmd/bao_l3_packm_b.c diff --git a/addon/gemmd/bao_l3_packm_b.h b/addon/old/gemmd/bao_l3_packm_b.h similarity index 100% rename from addon/gemmd/bao_l3_packm_b.h rename to addon/old/gemmd/bao_l3_packm_b.h diff --git a/addon/gemmd/bao_l3_packm_var.h b/addon/old/gemmd/bao_l3_packm_var.h similarity index 100% rename from addon/gemmd/bao_l3_packm_var.h rename to addon/old/gemmd/bao_l3_packm_var.h diff --git a/addon/gemmd/bao_l3_packm_var1.c b/addon/old/gemmd/bao_l3_packm_var1.c similarity index 100% rename from addon/gemmd/bao_l3_packm_var1.c rename to addon/old/gemmd/bao_l3_packm_var1.c diff --git a/addon/gemmd/bao_l3_packm_var2.c b/addon/old/gemmd/bao_l3_packm_var2.c similarity index 100% rename from addon/gemmd/bao_l3_packm_var2.c rename to 
addon/old/gemmd/bao_l3_packm_var2.c diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/old/gemmd/bao_packm_cxk.c similarity index 100% rename from addon/gemmd/bao_packm_cxk.c rename to addon/old/gemmd/bao_packm_cxk.c diff --git a/addon/gemmd/bao_packm_cxk.h b/addon/old/gemmd/bao_packm_cxk.h similarity index 100% rename from addon/gemmd/bao_packm_cxk.h rename to addon/old/gemmd/bao_packm_cxk.h diff --git a/addon/gemmd/gemmd.h b/addon/old/gemmd/gemmd.h similarity index 100% rename from addon/gemmd/gemmd.h rename to addon/old/gemmd/gemmd.h diff --git a/addon/gemmd/thread/bao_l3_decor.c b/addon/old/gemmd/thread/bao_l3_decor.c similarity index 100% rename from addon/gemmd/thread/bao_l3_decor.c rename to addon/old/gemmd/thread/bao_l3_decor.c diff --git a/addon/gemmd/thread/bao_l3_decor.h b/addon/old/gemmd/thread/bao_l3_decor.h similarity index 100% rename from addon/gemmd/thread/bao_l3_decor.h rename to addon/old/gemmd/thread/bao_l3_decor.h diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.c b/addon/old/gemmd/thread/bao_l3_decor_openmp.c similarity index 100% rename from addon/gemmd/thread/bao_l3_decor_openmp.c rename to addon/old/gemmd/thread/bao_l3_decor_openmp.c diff --git a/addon/gemmd/thread/bao_l3_decor_openmp.h b/addon/old/gemmd/thread/bao_l3_decor_openmp.h similarity index 100% rename from addon/gemmd/thread/bao_l3_decor_openmp.h rename to addon/old/gemmd/thread/bao_l3_decor_openmp.h diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.c b/addon/old/gemmd/thread/bao_l3_decor_pthreads.c similarity index 100% rename from addon/gemmd/thread/bao_l3_decor_pthreads.c rename to addon/old/gemmd/thread/bao_l3_decor_pthreads.c diff --git a/addon/gemmd/thread/bao_l3_decor_pthreads.h b/addon/old/gemmd/thread/bao_l3_decor_pthreads.h similarity index 100% rename from addon/gemmd/thread/bao_l3_decor_pthreads.h rename to addon/old/gemmd/thread/bao_l3_decor_pthreads.h diff --git a/addon/gemmd/thread/bao_l3_decor_single.c b/addon/old/gemmd/thread/bao_l3_decor_single.c similarity 
index 100% rename from addon/gemmd/thread/bao_l3_decor_single.c rename to addon/old/gemmd/thread/bao_l3_decor_single.c diff --git a/addon/gemmd/thread/bao_l3_decor_single.h b/addon/old/gemmd/thread/bao_l3_decor_single.h similarity index 100% rename from addon/gemmd/thread/bao_l3_decor_single.h rename to addon/old/gemmd/thread/bao_l3_decor_single.h diff --git a/sandbox/gemmlike/bli_sandbox.h b/sandbox/gemmlike/bli_sandbox.h index f3782b3db..6f33da602 100644 --- a/sandbox/gemmlike/bli_sandbox.h +++ b/sandbox/gemmlike/bli_sandbox.h @@ -53,7 +53,5 @@ #include "bls_packm_cxk.h" -#include "bls_l3_decor.h" - #endif diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index ba930ebc5..e0fb5bb8a 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -172,16 +172,16 @@ void bls_gemm_ex // Spawn threads (if applicable), where bls_gemm_int() is the thread entry // point function for each thread. This also begins the process of creating // the thrinfo_t tree, which contains thread communicators. 
- bls_l3_thread_decorator + bli_l3_sup_thread_decorator ( bls_gemm_int, BLIS_GEMM, // operation family id - ( obj_t* )alpha, - ( obj_t* )&a_local, - ( obj_t* )&b_local, - ( obj_t* )beta, - ( obj_t* )&c_local, - ( cntx_t* )cntx, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, &rntm_l ); } @@ -190,16 +190,16 @@ void bls_gemm_ex // -- Define the gemm-like operation's thread entry point ---------------------- // -void bls_gemm_int +err_t bls_gemm_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm, + thrinfo_t* thread ) { // In this function, we choose the gemm implementation that is executed @@ -214,9 +214,10 @@ void bls_gemm_int beta, c, cntx, - rntm, thread ); + + return BLIS_SUCCESS; } // diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h index 7380f02ad..b8dba9cfd 100644 --- a/sandbox/gemmlike/bls_gemm.h +++ b/sandbox/gemmlike/bls_gemm.h @@ -60,16 +60,16 @@ void bls_gemm_ex // -- Prototype the gemm-like operation's thread entry point ------------------- // -void bls_gemm_int +err_t bls_gemm_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm, + thrinfo_t* thread ); // diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 28c5032bc..02f7458ad 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -49,7 +49,6 @@ typedef void (*FUNCPTR_T) void* restrict beta, void* restrict c, inc_t rs_c, inc_t cs_c, cntx_t* restrict cntx, - rntm_t* restrict rntm, thrinfo_t* restrict thread ); @@ -63,14 +62,13 @@ static FUNCPTR_T 
GENARRAY_PREF(ftypes,bls_,gemm_bp_var1); void bls_gemm_bp_var1 ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + thrinfo_t* thread ) { const num_t dt = bli_obj_dt( c ); @@ -114,8 +112,7 @@ void bls_gemm_bp_var1 buf_b, rs_b, cs_b, buf_beta, buf_c, rs_c, cs_c, - cntx, - rntm, + ( cntx_t* )cntx, thread ); } @@ -140,7 +137,6 @@ void PASTECH2(bls_,ch,varname) \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ) \ { \ diff --git a/sandbox/gemmlike/bls_gemm_var.h b/sandbox/gemmlike/bls_gemm_var.h index 7c515f8c3..0a41afde4 100644 --- a/sandbox/gemmlike/bls_gemm_var.h +++ b/sandbox/gemmlike/bls_gemm_var.h @@ -42,14 +42,13 @@ \ void PASTECH(bls_,opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + thrinfo_t* thread \ ); GENPROT( gemm_bp_var1 ) @@ -75,7 +74,6 @@ void PASTECH2(bls_,ch,varname) \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); diff --git a/sandbox/gemmlike/bls_l3_decor.c b/sandbox/gemmlike/old/bls_l3_decor.c similarity index 100% rename from sandbox/gemmlike/bls_l3_decor.c rename to sandbox/gemmlike/old/bls_l3_decor.c diff --git a/sandbox/gemmlike/bls_l3_decor.h b/sandbox/gemmlike/old/bls_l3_decor.h similarity index 100% rename from sandbox/gemmlike/bls_l3_decor.h rename to sandbox/gemmlike/old/bls_l3_decor.h From b6735ca26b9d459d9253795dc5841ae8de9e84c9 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 6 Jan 2023 14:10:01 -0600 Subject: 
[PATCH 116/230] Refactor structure awareness in packm_blk_var1.c. (#707) Details: - Factored some of the structure awareness out of the loop in bli_packm_blk_var1(). So instead of having a single loop with conditionals in the body to handle various kinds of structure (and stored/unstored submatrix placement), we now have a conditional branch to handle various structure/storage scenarios with a loop in each section. This change was originally motivated to choose slab or round- robin partitioning (in the context of triangular matrices) based on the structure of the entire block (or panel) being packed rather than each micropanel individually. Previously, the code would attempt to limit rr to the portion of the block that intersects the diagonal and use slab for the remainder. However, that approach was not well-thought out and in many situations this would lead to inferior load balancing when compared to using round-robin for the entire block (or panel). This commit has the added benefit of incurring less overhead during the packing process now that each of the new loops is simpler. --- frame/1m/packm/bli_packm_blk_var1.c | 174 ++++++++++++++-------------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 05263c4b7..b8f4f945d 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -178,71 +178,98 @@ void bli_packm_blk_var1 char* p_begin = p_cast; - // Iterate over every logical micropanel in the source matrix. 
- for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; - ic += ic_inc, ip += ip_inc, it += 1 ) + if ( !bli_is_triangular( strucc ) || + bli_is_stored_subpart_n( diagoffc, uploc, iter_dim, panel_len_full ) ) { - dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); - dim_t panel_dim_off_i = panel_dim_off + ic; - - doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc; - char* c_begin = c_cast + (ic )*incc*dt_c_size; - - inc_t p_inc = ps_p; - - // NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr()) - // when packing micropanels of a triangular matrix. Hermitian/symmetric - // and general packing may use slab or round-robin (bli_packm_my_iter()), - // depending on which was selected at configure-time. - bool my_iter = ( bli_is_triangular( strucc ) && - bli_intersects_diag_n( diagoffc_i, panel_dim_i, - panel_len_full ) - ? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) - : bli_packm_my_iter ( it, it_start, it_end, tid, nt ) - ); - - if ( bli_is_triangular( strucc ) && - bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) ) + // This case executes if the panel is either dense, belongs + // to a Hermitian or symmetric matrix, which includes stored, + // unstored, and diagonal-intersecting panels, or belongs + // to a completely stored part of a triangular matrix. + + // Iterate over every logical micropanel in the source matrix. + for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; + ic += ic_inc, ip += ip_inc, it += 1 ) { - // This case executes if the panel belongs to a triangular - // matrix AND is completely unstored (ie: zero). If the panel - // is unstored, we do nothing. (Notice that we don't even - // increment p_begin.) 
+ dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); + dim_t panel_dim_off_i = panel_dim_off + ic; - continue; + char* c_begin = c_cast + (ic )*incc*dt_c_size; + + // Hermitian/symmetric and general packing may use slab or + // round-robin (bli_packm_my_iter()), depending on which was + // selected at configure-time. + if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + { + packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc, + diagc, + uploc, + conjc, + schema, + invdiag, + panel_dim_i, + panel_len_full, + panel_dim_max, + panel_len_max, + panel_dim_off_i, + panel_len_off, + kappa_cast, + c_begin, incc, ldc, + p_begin, ldp, is_p, + ( cntx_t* )cntx, + params ); + } + + p_begin += ps_p*dt_p_size; } - else if ( bli_is_triangular( strucc ) && - bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) ) + } + else + { + // This case executes if the panel belongs to a diagonal-intersecting + // part of a triangular matrix. + + // Iterate over every logical micropanel in the source matrix. + for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter; + ic += ic_inc, ip += ip_inc, it += 1 ) { - // This case executes if the panel belongs to a triangular - // matrix AND is diagonal-intersecting. Notice that we - // cannot bury the following conditional logic into - // packm_struc_cxk() because we need to know the value of - // panel_len_max_i so we can properly increment p_inc. - - // Sanity check. Diagonals should not intersect the short end of - // a micro-panel. If they do, then somehow the constraints on - // cache blocksizes being a whole multiple of the register - // blocksizes was somehow violated. 
- if ( diagoffc_i < 0 ) + dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); + dim_t panel_dim_off_i = panel_dim_off + ic; + + doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc; + char* c_begin = c_cast + (ic )*incc*dt_c_size; + + if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, + panel_len_full ) ) + continue; + + // Sanity check. Diagonals should not intersect the short edge of + // a micro-panel (typically corresponding to a register blocksize). + // If they do, then the constraints on cache blocksizes being a + // whole multiple of the register blocksizes was somehow violated. + if ( ( diagoffc_i > -panel_dim_i && + diagoffc_i < 0 ) || + ( diagoffc_i > panel_len_full && + diagoffc_i < panel_len_full + panel_dim_i ) ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); - dim_t panel_off_i; - dim_t panel_len_i; - dim_t panel_len_max_i; + dim_t panel_off_i = 0; + dim_t panel_len_i = panel_len_full; + dim_t panel_len_max_i = panel_len_max; - if ( bli_is_lower( uploc ) ) + if ( bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) ) { - panel_off_i = 0; - panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; - panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, - panel_len_max ); - } - else // if ( bli_is_upper( uploc ) ) - { - panel_off_i = bli_abs( diagoffc_i ); - panel_len_i = panel_len_full - panel_off_i; - panel_len_max_i = panel_len_max - panel_off_i; + if ( bli_is_lower( uploc ) ) + { + panel_off_i = 0; + panel_len_i = diagoffc_i + panel_dim_i; + panel_len_max_i = bli_min( diagoffc_i + panel_dim_max, + panel_len_max ); + } + else // if ( bli_is_upper( uploc ) ) + { + panel_off_i = diagoffc_i; + panel_len_i = panel_len_full - panel_off_i; + panel_len_max_i = panel_len_max - panel_off_i; + } } dim_t panel_len_off_i = panel_off_i + panel_len_off; @@ -259,7 +286,9 @@ void bli_packm_blk_var1 // We nudge the imaginary stride up by one if it is odd. is_p_use += ( bli_is_odd( is_p_use ) ? 
1 : 0 ); - if ( my_iter ) + // NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr()) + // when packing micropanels of a triangular matrix. + if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) { packm_ker_cast( strucc, diagc, @@ -284,37 +313,8 @@ void bli_packm_blk_var1 // NOTE: This value is usually LESS than ps_p because triangular // matrices usually have several micro-panels that are shorter // than a "full" micro-panel. - p_inc = is_p_use; + p_begin += is_p_use*dt_p_size; } - else - { - // This case executes if the panel is either dense, or belongs - // to a Hermitian or symmetric matrix, which includes stored, - // unstored, and diagonal-intersecting panels. - - if ( my_iter ) - { - packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc, - diagc, - uploc, - conjc, - schema, - invdiag, - panel_dim_i, - panel_len_full, - panel_dim_max, - panel_len_max, - panel_dim_off_i, - panel_len_off, - kappa_cast, - c_begin, incc, ldc, - p_begin, ldp, is_p, - ( cntx_t* )cntx, - params ); - } - } - - p_begin += p_inc*dt_p_size; } } From 2e1ba9d13c23a06a7b6f8bd326af428f7ea68c31 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 10 Jan 2023 21:05:54 -0600 Subject: [PATCH 117/230] Tile-level partitioning in jr/ir loops (ex-trsm). (#695) Details: - Reimplemented parallelization of the JR loop in gemmt (which is recycled for herk, her2k, syrk, and syr2k). Previously, the rectangular region of the current MC x NC panel of C would be parallelized separately from the diagonal region of that same submatrix, with the rectangular portion being assigned to threads via slab or round-robin (rr) partitioning (as determined at configure-time) and the diagonal region being assigned via round-robin. This approach did not work well when extracting lots of parallelism from the JR loop and was often suboptimal even for smaller degrees of parallelism. 
This commit implements tile-level load balancing (tlb) in which the IR loop is effectively subjugated in service of more equitably dividing work in the JR loop. This approach is especially potent for certain situations where the diagonal region of the MC x NR panel of C is significant relative to the entire region. However, it also seems to benefit many problem sizes of other level-3 operations (excluding trsm, which has an inherent algorithmic dependency in the IR loop that prevents the application of tlb). For now, tlb is implemented as _var2b.c macrokernels for gemm (which forms the basis for gemm, hemm, and symm), gemmt (which forms the basis of herk, her2k, syrk, and syr2k), and trmm (which forms the basis of trmm and trmm3). Which function pointers (_var2() or _var2b()) are embedded in the control tree will depend on whether the BLIS_ENABLE_JRIR_TLB cpp macro is defined, which is controlled by the value passed to the existing --thread-part-jrir=METHOD (or -r METHOD) configure option. This script adds 'tlb' as a valid option alongside the previously supported values of 'slab' and 'rr'. ('slab' is still the default.) Thanks to Leick Robinson for abstractly inspiring this work, and to Minh Quan Ho for inquiring (in PR #562, and before that in Issue #437) about the possibility of improved load balance in macrokernel loops, and even prototyping what it might look like, long before I fully understood the problem. - In bli_thread_range_weighted_sub(), tweaked the way we compute the area of the current MC x NC trapezoidal panel of C by better taking into account the microtile structure along the diagonal. Previously, it was an underestimate, as it assumed MR = NR = 1 (that is, it assumed that the microtile column of C that overlapped with microtiles exactly coincided with the diagonal). Now, we only assume MR = NR. 
This is still a slight underestimate when MR != NR, so the additional area is scaled by 1.5 in a hackish attempt to compensate for this, as well as other additional effects that are difficult to model (such as the increased cost of writing to temporary tiles before finally updating C). The net effect of this better estimation of the trapezoidal area should be (on average) slightly larger regions assigned to threads that have little or no overlap with the diagonal region (and correspondingly slightly smaller regions in the diagonal region), which we expect will lead to slightly better load balancing in most situations. - Spun off the contents of bli_thread.[ch] that relate to computing thread ranges into one of three source/header file pairs: - bli_thread_range.[ch], which define functions that are not specific to the jr/ir loops; - bli_thread_range_slab_rr.[ch], which define functions that implement slab or round-robin partitioning for the jr/ir loops; - bli_thread_range_tlb.[ch], which define functions that implement tlb for the jr/ir loops. - Fixed the computation of a_next in the last iteration of the IR loop in bli_gemmt_l_ker_var2(). Previously, it always "wrapped" back around to the first micropanel of the current MC x KC packed block of A. However, this is almost never actually the micropanel that is used next. A new macro, bli_gemmt_l_wrap_a_upanel(), computes a_next correctly, with a similarly named bli_gemmt_u_wrap_a_upanel() for use in the upper-stored case (which *does* actually always choose the first micropanel of A as its a_next at the end of the IR loop). - Removed adjustments for a_next/b_next (a2/b2) for the diagonal- intersecting case of gemmt_l_ker_var2() and the above-diagonal case of gemmt_u_ker_var2() since these cases will only coincide with the last iteration of the IR loop in very small problems. 
- Defined bli_is_last_iter_l() and bli_is_last_iter_u(), the latter of which explicitly considers whether the current microtile is the last tile that intersects the diagonal. (The former does the same, but the computation coincides with the original bli_is_last_iter().) These functions are now used in gemmt to test when a_next (or a2) should "wrap" (as discussed above). Also defined bli_is_last_iter_tlb_l() and bli_is_last_iter_tlb_u(), which are similar to the aforementioned functions but are used when employing tlb in gemmt. - Redefined macros in bli_packm_thrinfo.h, which test whether an iteration of work is assigned to a thread, as static inline functions in bli_param_macro_defs.h (and then deleted bli_packm_thrinfo.h). In the process of redefining these macros, I also renamed them from bli_packm_my_iter_rr/sl() to bli_is_my_iter_rr/sl(). - Renamed bli_thread_range_jrir_rr() -> bli_thread_range_rr() bli_thread_range_jrir_sl() -> bli_thread_range_sl() bli_thread_range_jrir() -> bli_thread_range_slrr() - Renamed bli_is_last_iter() -> bli_is_last_iter_slrr() - Defined bli_info_get_thread_jrir_tlb() and renamed: - bli_info_get_thread_part_jrir_slab() -> bli_info_get_thread_jrir_slab() - bli_info_get_thread_part_jrir_rr() -> bli_info_get_thread_jrir_rr() - Modified bli_rntm_set_ways_for_op() to redirect IR loop parallelism into the JR loop when tlb is enabled for non-trsm level-3 operations. - Added a sanity check to prevent bli_prune_unref_mparts() from being used on packed objects. This prohibition is necessary because the current implementation does not take into account the atomicity of packed micropanel widths relative to the diagonal of structured matrices. That is, the function prunes greedily without regard to whether doing so would prune off part of a micropanel *which has already been packed* and assigned to a thread for inclusion in the computation. 
- Further restricted early returns in bli_prune_unref_mparts() to situations where the primary matrix is not only of general structure but also dense (in terms of its uplo_t value). The addition of the matrix's dense-ness to the conditional is required because gemmt is somewhat unusual in that its C matrix has general structure but is marked as lower- or upper-stored via its uplo_t. By only checking for general structure, attempts to prune gemmt C matrices would incorrectly result in early returns, even though that operation effectively treats the matrix as symmetric (and stored in only one triangle). - Fixed a latent bug in bli_thread_range_rr() wherein incorrect ranges were computed when 1 < bf. Thankfully, this bug was not yet manifesting since all current invocations used bf == 1. - Fixed a latent bug in some unexercised code in bli_?gemmt_l_ker_var2() that would perform incorrect pruning of unreferenced regions above where the diagonal of a lower-stored matrix intersects the right edge. Thankfully, the bug was not harming anything since those unreferenced regions were being pruned prior to the macrokernel. - Rewrote slab/rr-based gemmt macrokernels so that they no longer carved C into rectangular and diagonal regions prior to parallelizing each separately. The new macrokernels use a unified loop structure where quadratic (slab) partitioning is used. - Updated all level-3 macrokernels to have a more uniform coding style, such as wrt combining variable declarations with initializations as well as the use of const. - Updated bls_l3_packm_var[123].c to use bli_thrinfo_n_way() and bli_thrinfo_work_id() instead of bli_thrinfo_num_threads() and bli_thrinfo_thread_id(), respectively. This change probably should have been included in aeb5f0c. - Removed old prototypes in bli_gemmt_var.h and bli_trmm_var.h that corresponded to functions that were removed in aeb5f0c. - Other very minor cleanups. - Comment updates. 
--- build/bli_config.h.in | 4 + configure | 61 +- frame/1m/packm/bli_packm.h | 1 - frame/1m/packm/bli_packm_blk_var1.c | 16 +- frame/3/bli_l3_sup_packm_var.c | 16 +- frame/3/bli_l3_sup_var12.c | 4 +- frame/3/bli_l3_thrinfo.h | 12 +- frame/3/gemm/bli_gemm_cntl.c | 23 +- frame/3/gemm/bli_gemm_ker_var2.c | 93 +- frame/3/gemm/bli_gemm_ker_var2b.c | 379 ++++ frame/3/gemm/bli_gemm_var.h | 3 +- frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c | 429 +++++ frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c | 418 ++++ frame/3/gemmt/bli_gemmt_l_ker_var2.c | 274 +-- frame/3/gemmt/bli_gemmt_l_ker_var2b.c | 387 ++++ frame/3/gemmt/bli_gemmt_u_ker_var2.c | 273 +-- frame/3/gemmt/bli_gemmt_u_ker_var2b.c | 386 ++++ frame/3/gemmt/bli_gemmt_var.h | 45 +- frame/3/gemmt/bli_gemmt_x_ker_var2.c | 14 +- frame/3/gemmt/bli_gemmt_x_ker_var2b.c | 73 + .../3/gemmt/other/bli_gemmt_l_ker_var2.c.prev | 507 +++++ .../other/bli_gemmt_l_ker_var2b.c.before | 427 +++++ .../3/gemmt/other/bli_gemmt_u_ker_var2.c.prev | 510 +++++ .../other/bli_gemmt_u_ker_var2b.c.before | 415 ++++ frame/3/trmm/bli_trmm_ll_ker_var2.c | 99 +- frame/3/trmm/bli_trmm_ll_ker_var2b.c | 365 ++++ frame/3/trmm/bli_trmm_lu_ker_var2.c | 99 +- frame/3/trmm/bli_trmm_lu_ker_var2b.c | 366 ++++ frame/3/trmm/bli_trmm_rl_ker_var2.c | 75 +- frame/3/trmm/bli_trmm_rl_ker_var2b.c | 392 ++++ frame/3/trmm/bli_trmm_ru_ker_var2.c | 77 +- frame/3/trmm/bli_trmm_ru_ker_var2b.c | 390 ++++ frame/3/trmm/bli_trmm_var.h | 53 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 14 +- frame/3/trmm/bli_trmm_xx_ker_var2b.c | 87 + .../3/trmm/other/bli_trmm_rl_ker_var2.c.prev | 371 ++++ .../trmm/other/bli_trmm_rl_ker_var2.c.unified | 324 ++++ frame/3/trmm/other/bli_trmm_ru_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 65 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 69 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 143 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 12 +- frame/3/trsm/bli_trsm_var.h | 2 +- frame/3/trsm/bli_trsm_xx_ker_var2.c | 14 +- frame/base/bli_info.c | 12 +- 
frame/base/bli_info.h | 5 +- frame/base/bli_prune.c | 39 +- frame/base/bli_rntm.c | 40 +- frame/include/bli_config_macro_defs.h | 10 + frame/include/bli_kernel_macro_defs.h | 2 + frame/include/bli_param_macro_defs.h | 51 +- frame/include/blis.h | 4 + frame/thread/bli_thread.c | 901 --------- frame/thread/bli_thread.h | 180 +- frame/thread/bli_thread_range.c | 1121 +++++++++++ frame/thread/bli_thread_range.h | 128 ++ frame/thread/bli_thread_range_slab_rr.c | 134 ++ frame/thread/bli_thread_range_slab_rr.h | 116 ++ frame/thread/bli_thread_range_tlb.c | 1699 +++++++++++++++++ frame/thread/bli_thread_range_tlb.h | 192 ++ frame/thread/old/bli_thread_range_snake.c | 120 ++ .../old/bli_thread_range_snake.h} | 46 +- sandbox/gemmlike/bls_gemm_bp_var1.c | 4 +- sandbox/gemmlike/bls_l3_packm_var1.c | 8 +- sandbox/gemmlike/bls_l3_packm_var2.c | 8 +- testsuite/src/test_libblis.c | 7 +- testsuite/src/test_trmm.c | 3 + 67 files changed, 10597 insertions(+), 2022 deletions(-) create mode 100644 frame/3/gemm/bli_gemm_ker_var2b.c create mode 100644 frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c create mode 100644 frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c create mode 100644 frame/3/gemmt/bli_gemmt_l_ker_var2b.c create mode 100644 frame/3/gemmt/bli_gemmt_u_ker_var2b.c create mode 100644 frame/3/gemmt/bli_gemmt_x_ker_var2b.c create mode 100644 frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev create mode 100644 frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before create mode 100644 frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev create mode 100644 frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before create mode 100644 frame/3/trmm/bli_trmm_ll_ker_var2b.c create mode 100644 frame/3/trmm/bli_trmm_lu_ker_var2b.c create mode 100644 frame/3/trmm/bli_trmm_rl_ker_var2b.c create mode 100644 frame/3/trmm/bli_trmm_ru_ker_var2b.c create mode 100644 frame/3/trmm/bli_trmm_xx_ker_var2b.c create mode 100644 frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev create mode 100644 
frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified create mode 100644 frame/thread/bli_thread_range.c create mode 100644 frame/thread/bli_thread_range.h create mode 100644 frame/thread/bli_thread_range_slab_rr.c create mode 100644 frame/thread/bli_thread_range_slab_rr.h create mode 100644 frame/thread/bli_thread_range_tlb.c create mode 100644 frame/thread/bli_thread_range_tlb.h create mode 100644 frame/thread/old/bli_thread_range_snake.c rename frame/{1m/packm/bli_packm_thrinfo.h => thread/old/bli_thread_range_snake.h} (70%) diff --git a/build/bli_config.h.in b/build/bli_config.h.in index 41e76d214..7dc67059f 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -80,6 +80,10 @@ #define BLIS_ENABLE_JRIR_RR #endif +#if @enable_jrir_tlb@ +#define BLIS_ENABLE_JRIR_TLB +#endif + #if @enable_pba_pools@ #define BLIS_ENABLE_PBA_POOLS #else diff --git a/configure b/configure index 286a66123..06201b4fa 100755 --- a/configure +++ b/configure @@ -340,16 +340,36 @@ print_usage() echo " " echo " -r METHOD, --thread-part-jrir=METHOD" echo " " - echo " Request a method of assigning micropanels to threads in" - echo " the JR and IR loops. Valid values for METHOD are 'slab'" - echo " and 'rr'. Using 'slab' assigns (as much as possible)" - echo " contiguous regions of micropanels to each thread while" - echo " using 'rr' assigns micropanels to threads in a round-" - echo " robin fashion. The chosen method also applies during" - echo " the packing of A and B. The default method is 'slab'." - echo " NOTE: Specifying this option constitutes a request," - echo " which may be ignored in select situations if the" - echo " implementation has a good reason to do so." + echo " Select a strategy for partitioning computation in JR and" + echo " IR loops and assigning that computation to threads. 
Valid" + echo " values for METHOD are 'rr', 'slab', and 'tlb':" + echo " 'rr': Assign the computation associated with whole" + echo " columns of microtiles to threads in a round-" + echo " robin fashion. When selected, round-robin" + echo " assignment is also employed during packing." + echo " 'slab': Partition the computation into N contiguous" + echo " regions, where each region contains a whole" + echo " number of microtile columns, and assign one" + echo " region to each thread. For some operations, the" + echo " number of microtile columns contained within a" + echo " given region may differ from that of other" + echo " regions, depending on how much work is implied" + echo " by each region. When selected, slab assignment" + echo " is also employed during packing." + echo " 'tlb': Tile-level load balancing is similar to slab," + echo " except that regions will be divided at a more" + echo " granular level (individual microtiles instead" + echo " of whole columns of microtiles) to ensure more" + echo " equitable assignment of work to threads. When" + echo " selected, tlb will only be employed for level-3" + echo " operations except trsm; due to practical and" + echo " algorithmic limitations, slab partitioning will" + echo " be used instead during packing and for trsm." + echo " The default strategy is 'slab'. NOTE: Specifying this" + echo " option constitutes a request, which may be ignored in" + echo " select situations if implementation has a good reason to" + echo " do so. (See description of 'tlb' above for an example of" + echo " this.)" echo " " echo " --disable-trsm-preinversion, --enable-trsm-preinversion" echo " " @@ -3731,16 +3751,20 @@ main() # Check the method of assigning micropanels to threads in the JR and IR # loops. - enable_jrir_slab_01=0 enable_jrir_rr_01=0 - if [ "x${thread_part_jrir}" = "xslab" ]; then - echo "${script_name}: requesting slab threading in jr and ir loops." 
- enable_jrir_slab_01=1 - elif [ "x${thread_part_jrir}" = "xrr" ]; then - echo "${script_name}: requesting round-robin threading in jr and ir loops." + enable_jrir_slab_01=0 + enable_jrir_tlb_01=0 + if [ "x${thread_part_jrir}" = "xrr" ]; then + echo "${script_name}: requesting round-robin (rr) work partitioning in jr and/or ir loops." enable_jrir_rr_01=1 + elif [ "x${thread_part_jrir}" = "xslab" ]; then + echo "${script_name}: requesting slab work partitioning in jr and/or ir loops." + enable_jrir_slab_01=1 + elif [ "x${thread_part_jrir}" = "xtlb" ]; then + echo "${script_name}: requesting tile-level load balancing (tlb) in unified jr+ir loop." + enable_jrir_tlb_01=1 else - echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}." + echo "${script_name}: *** Unsupported method of work partitioning in jr/ir loops: ${thread_part_jrir}." exit 1 fi @@ -4177,8 +4201,9 @@ main() | sed -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \ | sed -e "s/@enable_hpx@/${enable_hpx_01}/g" \ | sed -e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g" \ - | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ | sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \ + | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ + | sed -e "s/@enable_jrir_tlb@/${enable_jrir_tlb_01}/g" \ | sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \ | sed -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \ | sed -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \ diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 80878fba0..7d73bf903 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -39,7 +39,6 @@ #include "bli_packm_init.h" #include "bli_packm_int.h" #include "bli_packm_scalar.h" -#include "bli_packm_thrinfo.h" #include "bli_packm_part.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index b8f4f945d..561988e7f 100644 --- 
a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -170,11 +170,11 @@ void bli_packm_blk_var1 const dim_t tid = bli_thrinfo_work_id( thread ); // Determine the thread range and increment using the current thread's - // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + // packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() // will depend on whether slab or round-robin partitioning was requested // at configure-time. dim_t it_start, it_end, it_inc; - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); char* p_begin = p_cast; @@ -195,10 +195,10 @@ void bli_packm_blk_var1 char* c_begin = c_cast + (ic )*incc*dt_c_size; - // Hermitian/symmetric and general packing may use slab or - // round-robin (bli_packm_my_iter()), depending on which was - // selected at configure-time. - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + // Hermitian/symmetric and general packing may use slab or round- + // robin (bli_is_my_iter()), depending on which was selected at + // configure-time. + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) { packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc, diagc, @@ -286,9 +286,9 @@ void bli_packm_blk_var1 // We nudge the imaginary stride up by one if it is odd. is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); - // NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr()) + // NOTE: We MUST use round-robin work allocation (bli_is_my_iter_rr()) // when packing micropanels of a triangular matrix. 
- if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) + if ( bli_is_my_iter_rr( it, tid, nt ) ) { packm_ker_cast( strucc, diagc, diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index e47f65aea..67b33f407 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -155,10 +155,10 @@ void PASTEMAC(ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( ic = ic0, it = 0; it < n_iter; \ @@ -175,9 +175,9 @@ void PASTEMAC(ch,varname) \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ f \ ( \ @@ -398,10 +398,10 @@ void PASTEMAC(ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. 
*/ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( it = 0; it < n_iter; it += 1 ) \ @@ -412,9 +412,9 @@ void PASTEMAC(ch,varname) \ ctype* p_use = p_begin; \ \ { \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \ ( \ diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c index d65482243..4162c3d33 100644 --- a/frame/3/bli_l3_sup_var12.c +++ b/frame/3/bli_l3_sup_var12.c @@ -357,11 +357,11 @@ void PASTEMAC(ch,varname) \ object. */ \ /* ctype* a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_slrr( i, ir_iter, 0, 1 ) ) \ { \ a2 = a_00; \ b2 = bli_gemm_get_next_b_upanel( b_jr, jrstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, jr_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_slrr( j, jr_iter, 0, 1 ) ) \ b2 = b_00; \ } \ \ diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index b1290df50..2ea7a3fc2 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -39,22 +39,22 @@ // gemm -// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt -// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) +// NOTE: Here, we assume NO parallelism in the IR loop. +#define bli_gemmt_l_wrap_a_upanel( a0, step, doff_j, mr, nr ) \ + ( a0 + ( (-doff_j + 1*nr) / mr ) * step ) +#define bli_gemmt_u_wrap_a_upanel( a0, step, doff_j, mr, nr ) \ + ( a0 ) + // trmm -// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index bd8d97d13..b9c231cf7 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -61,10 +61,25 @@ cntl_t* bli_gemmbp_cntl_create void_fp macro_kernel_fp; // Choose the default macrokernel based on the operation family... - if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; - else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; - else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; - else /* should never execute */ macro_kernel_fp = NULL; + if ( family == BLIS_GEMM ) macro_kernel_fp = + #ifdef BLIS_ENABLE_JRIR_TLB + bli_gemm_ker_var2b; + #else // ifdef ( _SLAB || _RR ) + bli_gemm_ker_var2; + #endif + else if ( family == BLIS_GEMMT ) macro_kernel_fp = + #ifdef BLIS_ENABLE_JRIR_TLB + bli_gemmt_x_ker_var2b; + #else // ifdef ( _SLAB || _RR ) + bli_gemmt_x_ker_var2; + #endif + else if ( family == BLIS_TRMM ) macro_kernel_fp = + #ifdef BLIS_ENABLE_JRIR_TLB + bli_trmm_xx_ker_var2b; + #else // ifdef ( _SLAB || _RR ) + bli_trmm_xx_ker_var2; + #endif + else /* should never execute */ macro_kernel_fp = NULL; // ...unless a non-NULL kernel function pointer is passed in, in which // case we use that instead. 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index d59695081..3e862e6c5 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -47,7 +47,7 @@ typedef void (*xpbys_mxn_vft) #undef GENTFUNC2 #define GENTFUNC2(ctypex,ctypey,chx,chy,op) \ \ -void PASTEMAC2(chx,chy,op) \ +BLIS_INLINE void PASTEMAC2(chx,chy,op) \ ( \ dim_t m, \ dim_t n, \ @@ -77,31 +77,31 @@ static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn); void bli_gemm_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { num_t dt_exec = bli_obj_exec_dt( c ); num_t dt_c = bli_obj_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); const char* a_cast = bli_obj_buffer_at_off( a ); - inc_t is_a = bli_obj_imag_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); const char* b_cast = bli_obj_buffer_at_off( b ); - inc_t is_b = bli_obj_imag_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); @@ -116,8 +116,7 @@ void bli_gemm_ker_var2 // NOTE: We know that the internal scalars of A and B are already of the // target datatypes because the necessary typecasting would have already // taken place during bli_packm_init(). 
- obj_t scalar_a; - obj_t scalar_b; + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); @@ -217,22 +216,19 @@ void bli_gemm_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_c_size; - inc_t cstep_c = cs_c * NR * dt_c_size; + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; auxinfo_t aux; @@ -255,20 +251,19 @@ void bli_gemm_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; // Determine the thread range and increment for the 2nd and 1st loops. 
- // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -276,7 +271,9 @@ void bli_gemm_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -287,15 +284,17 @@ void bli_gemm_ker_var2 const char* a1 = a_cast + i * rstep_a; char* c11 = c1 + i * rstep_c; - const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // Compute the addresses of the next panels of A and B. const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) + if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) ) { a2 = a_cast; b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -342,22 +341,20 @@ void bli_gemm_ker_var2 ( cntx_t* )cntx ); - // Accumulate to C with type-casting. + // Accumulate to C with typecasting. 
xpbys_mxn[ dt_exec ][ dt_c ] ( - m_cur, n_cur, - &ct, rs_ct, cs_ct, - ( void* )beta_cast, - c11, rs_c, cs_c + m_cur, n_cur, + &ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c ); } } } - -/* -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); -*/ } +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); + diff --git a/frame/3/gemm/bli_gemm_ker_var2b.c b/frame/3/gemm/bli_gemm_ker_var2b.c new file mode 100644 index 000000000..50375708a --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var2b.c @@ -0,0 +1,379 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_vft) + ( + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC2 +#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \ +\ +BLIS_INLINE void PASTEMAC2(chx,chy,op) \ + ( \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctypex* restrict x_cast = x; \ + ctypey* restrict b_cast = b; \ + ctypey* restrict y_cast = y; \ +\ + PASTEMAC3(chx,chy,chy,xpbys_mxn) \ + ( \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC2_BASIC0(xpbys_mxnb_fn); +INSERT_GENTFUNC2_MIXDP0(xpbys_mxnb_fn); + +static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxnb_fn); + + +void bli_gemm_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + num_t dt_c = bli_obj_dt( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( 
a ); + + const char* a_cast = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + const char* b_cast = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + char* c_cast = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Detach and multiply the scalars attached to A and B. + // NOTE: We know that the internal scalars of A and B are already of the + // target datatypes because the necessary typecasting would have already + // taken place during bli_packm_init(). + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + // NOTE: We know that scalar_b is of type dt_exec due to the above code + // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, + // and we know that the internal scalar in C is already of the type dt_c + // due to the casting in the implementation of bli_obj_scalar_attach(). + const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); + const char* beta_cast = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. + // Only employ this optimization if the storage datatype of C is + // equal to the execution/computation datatype. 
+#if 1 + if ( bli_cntx_method( cntx ) == BLIS_1M ) + { + bli_gemm_ind_recast_1m_params + ( + &dt_exec, + &dt_c, + schema_a, + c, + &m, &n, &k, + &pd_a, &ps_a, + &pd_b, &ps_b, + &rs_c, &cs_c, + cntx + ); + } +#endif + +#ifdef BLIS_ENABLE_GEMM_MD + // Tweak parameters in select mixed domain cases (rcc, crc, ccr). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_gemm_md_ker_var2_recast + ( + &dt_exec, + bli_obj_dt( a ), + bli_obj_dt( b ), + &dt_c, + &m, &n, &k, + &pd_a, &ps_a, + &pd_b, &ps_b, + c, + &rs_c, &cs_c + ); + } +#endif + + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + + // Query the params field from the obj_t. If it is non-NULL, grab the ukr + // field of the params struct. If that function pointer is non-NULL, use it + // as our microkernel instead of the default microkernel queried from the + // cntx above. + const gemm_ker_params_t* params = bli_obj_ker_params( c ); + gemm_ukr_vft user_ukr = params ? params->ukr : NULL; + if ( user_ukr ) gemm_ukr = user_ukr; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); + const char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + + // + // Assumptions/assertions: + // rs_a == 1 + // cs_a == PACKMR + // pd_a == MR + // ps_a == stride to next micro-panel of A + // rs_b == PACKNR + // cs_b == 1 + // pd_b == NR + // ps_b == stride to next micro-panel of B + // rs_c == (no assumptions) + // cs_c == (no assumptions) + // + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // Save the virtual microkernel address and the params. + bli_auxinfo_set_ukr( gemm_ukr, &aux ); + bli_auxinfo_set_params( params, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Notice that this variant doesn't utilize + // parallelism in the 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + // Determine the starting microtile offsets and number of microtiles to + // compute for each thread. 
Note that assignment of microtiles is done + // according to the tlb policy. + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_d( jr_nt, jr_tid, m_iter, n_iter, MR, NR, &jr_st, &ir_st ); + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. 
+ bli_auxinfo_set_next_a( a2, &aux ); + + // Edge case handling now occurs within the microkernel itself, but + // we must still explicitly accumulate to a temporary microtile in + // situations where a virtual microkernel is being used, such as + // during the 1m method or some cases of mixed datatypes. + if ( dt_exec == dt_c ) + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + else + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + &ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Accumulate to C with typecasting. + xpbys_mxn[ dt_exec ][ dt_c ] + ( + m_cur, n_cur, + &ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); + } + + ut += 1; + if ( ut == n_ut_for_me ) return; + } + + i = 0; + } +} + +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: b1", k, NR, b1, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: a1", MR, k, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 24f7ecfb9..f69327db0 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -65,6 +65,7 @@ GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) -GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) +GENPROT( gemm_ker_var2b ) + diff --git a/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c new file mode 100644 index 000000000..fbfafebb0 --- /dev/null +++ b/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c @@ -0,0 +1,429 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_l_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); + +static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); + +void bli_gemmt_l_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. 
+ obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still negative (though its absolute value is guaranteed to be less + than MR). */ \ + if ( diagoffc < 0 ) \ + { \ + const dim_t ip = -diagoffc / MR; \ + const dim_t i = ip * MR; \ +\ + m = m - i; \ + diagoffc = diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + const dim_t jr_inc = 1; \ + const dim_t ir_inc = 1; \ +\ + /* Determine the starting microtile offsets and number of microtiles to + compute for each thread. Note that assignment of microtiles is done + according to the tlb policy. */ \ + dim_t jr_st, ir_st; \ + const dim_t n_ut_for_me \ + = \ + bli_thread_range_tlb( thread, diagoffc, BLIS_LOWER, m, n, MR, NR, \ + &jr_st, &ir_st ); \ +\ + /* It's possible that there are so few microtiles relative to the number + of threads that one or more threads gets no work. If that happens, those + threads can return early. */ \ + if ( n_ut_for_me == 0 ) return; \ +\ + /* Start the jr/ir loops with the current thread's microtile offsets computed + by bli_thread_range_tlb(). */ \ + dim_t i = ir_st; \ + dim_t j = jr_st; \ +\ + /* Initialize a counter to track the number of microtiles computed by the + current thread. */ \ + dim_t ut = 0; \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( ; true; ++j ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( ; i < m_iter; ++i ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). */ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR microtile, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the microtile is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we simply advance + to last microtile before the diagonal. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
*/ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_tlb_l( i, m_iter ) ) \ + { \ + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + /* We don't bother computing b2 for the last iteration of the + jr loop since the current thread won't know its j_st until + the next time it calls bli_thread_range_tlb(). */ \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else /* if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) */ \ + { \ + /* Skip ahead to the last microtile strictly above the diagonal. */ \ + i = -diagoffc_j / MR - 1; \ + } \ + } \ +\ + /* Upon reaching the end of the column of microtiles, get ready to begin at + the beginning of the next column (i.e., the next jr loop iteration). 
*/ \ + i = 0; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2b ) + diff --git a/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c new file mode 100644 index 000000000..311180d19 --- /dev/null +++ b/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c @@ -0,0 +1,418 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2b); + + +void bli_gemmt_u_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ + if ( diagoffc > 0 ) \ + { \ + const dim_t jp = diagoffc / NR; \ + const dim_t j = jp * NR; \ +\ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 
1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the virtual microkernel address and the params. */ \ + /*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \ + /*bli_auxinfo_set_params( params, &aux );*/ \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + const dim_t jr_inc = 1; \ + const dim_t ir_inc = 1; \ +\ + /* Determine the starting microtile offsets and number of microtiles to + compute for each thread. Note that assignment of microtiles is done + according to the tlb policy. */ \ + dim_t jr_st, ir_st; \ + const dim_t n_ut_for_me \ + = \ + bli_thread_range_tlb( thread, diagoffc, BLIS_UPPER, m, n, MR, NR, \ + &jr_st, &ir_st ); \ +\ + /* It's possible that there are so few microtiles relative to the number + of threads that one or more threads gets no work. If that happens, those + threads can return early. */ \ + if ( n_ut_for_me == 0 ) return; \ +\ + /* Start the jr/ir loops with the current thread's microtile offsets computed + by bli_thread_range_tlb(). */ \ + dim_t i = ir_st; \ + dim_t j = jr_st; \ +\ + /* Initialize a counter to track the number of microtiles computed by the + current thread. */ \ + dim_t ut = 0; \ +\ + /* Loop over the n dimension (NR columns at a time). 
*/ \ + for ( ; true; ++j ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( ; i < m_iter; ++i ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). */ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it to the temporary buffer and then add in the elements + on or above the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we simply advance + to the last microtile before the bottom of the matrix. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_tlb_u( diagoffc_ij, MR, NR ) ) \ + { \ + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + /* We don't bother computing b2 for the last iteration of the + jr loop since the current thread won't know its j_st until + the next time it calls bli_thread_range_tlb(). */ \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else /* if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) */ \ + { \ + /* Skip past the microtiles strictly below the diagonal. 
*/ \ + i = m_iter - 1; \ + } \ + } \ +\ + i = 0; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2b ) + diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index 4a3a48304..fd726da6f 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -48,7 +48,7 @@ typedef void (*xpbys_mxn_l_vft) #undef GENTFUNC #define GENTFUNC(ctype,ch,op) \ \ -void PASTEMAC(ch,op) \ +BLIS_INLINE void PASTEMAC(ch,op) \ ( \ doff_t diagoff, \ dim_t m, \ @@ -76,18 +76,19 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); + void bli_gemmt_l_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { - const num_t dt = bli_obj_exec_dt( c ); - const dim_t dt_size = bli_dt_size( dt ); + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); doff_t diagoffc = bli_obj_diag_offset( c ); @@ -113,7 +114,7 @@ void bli_gemmt_l_ker_var2 const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. - obj_t scalar_a, scalar_b; + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); @@ -123,14 +124,17 @@ void bli_gemmt_l_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + // Alias some constants to simpler names. const dim_t MR = pd_a; const dim_t NR = pd_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. 
- gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); - xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt ]; + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ]; // Temporary C buffer for edge cases. Note that the strides of this // temporary buffer are set so that they match the storage of the @@ -138,11 +142,11 @@ void bli_gemmt_l_ker_var2 // column-stored as well. char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); - const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); const char* a_cast = buf_a; const char* b_cast = buf_b; char* c_cast = buf_c; @@ -175,12 +179,13 @@ void bli_gemmt_l_ker_var2 // this case as if the diagonal offset were zero. if ( diagoffc < 0 ) { - dim_t ip = -diagoffc / MR; - dim_t i = ip * MR; - m = m - i; - diagoffc = -diagoffc % MR; - c_cast = c_cast + (i )*rs_c*dt_size; - a_cast = a_cast + (ip )*ps_a*dt_size; + const dim_t ip = -diagoffc / MR; + const dim_t i = ip * MR; + + m = m - i; + diagoffc = diagoffc % MR; + c_cast = c_cast + (i )*rs_c*dt_c_size; + a_cast = a_cast + (ip )*ps_a*dt_size; } // If there is a zero region to the right of where the diagonal @@ -193,25 +198,23 @@ void bli_gemmt_l_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 
1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); @@ -219,9 +222,6 @@ void bli_gemmt_l_ker_var2 bli_auxinfo_set_is_a( is_a, &aux ); bli_auxinfo_set_is_b( is_b, &aux ); - // Save the desired output datatype (indicating no typecasting). - //bli_auxinfo_set_dt_on_output( dt, &aux );*/ - // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. @@ -229,48 +229,21 @@ void bli_gemmt_l_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. 
- dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); - - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - // Note that we partition the 2nd loop into two regions: the rectangular - // part of C, and the triangular portion. - dim_t n_iter_rct; - dim_t n_iter_tri; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) - { - // If the entire panel of C does not intersect the diagonal, there is - // no triangular region, and therefore we can skip the second set of - // loops. - n_iter_rct = n_iter; - n_iter_tri = 0; - } - else - { - // If the panel of C does intersect the diagonal, compute the number of - // iterations in the rectangular region by dividing NR into the diagonal - // offset. Any remainder from this integer division is discarded, which - // is what we want. That is, we want the rectangular region to contain - // as many columns of whole microtiles as possible without including any - // microtiles that intersect the diagonal. The number of iterations in - // the triangular (or trapezoidal) region is computed as the remaining - // number of iterations in the n dimension. - n_iter_rct = diagoffc / NR; - n_iter_tri = n_iter - n_iter_rct; - } - - // Determine the thread range and increment for the 2nd and 1st loops for - // the initial rectangular region of C (if it exists). - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // Determine the thread range and increment for the 2nd and 1st loops. 
+ // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_quad( thread, diagoffc, BLIS_LOWER, m, n, NR, + FALSE, &jr_start, &jr_end, &jr_inc ); + //bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -278,7 +251,12 @@ void bli_gemmt_l_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -286,115 +264,34 @@ void bli_gemmt_l_ker_var2 // Interior loop over the m dimension (MR rows at a time). for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; - - // No need to compute the diagonal offset for the rectangular - // region. - //diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ + // Compute the diagonal offset for the microtile at (i,j). + const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); - // Compute the addresses of the next panels of A and B. 
- const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) - { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } - - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); - - // If the diagonal intersects the current MR x NR submatrix, we + // If the diagonal intersects the current MR x NR microtile, we // compute it the temporary buffer and then add in the elements // on or below the diagonal. - // Otherwise, if the submatrix is strictly below the diagonal, + // Otherwise, if the microtile is strictly below the diagonal, // we compute and store as we normally would. // And if we're strictly above the diagonal, we do nothing and - // continue. + // continue on through the IR loop to consider the next MR x NR + // microtile. + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) { - // Invoke the gemm micro-kernel. - gemm_ukr - ( - m_cur, - n_cur, - k, - ( void* )alpha_cast, - ( void* )a1, - ( void* )b1, - ( void* )beta_cast, - c11, rs_c, cs_c, - &aux, - ( cntx_t* )cntx - ); - } - } - } - - // If there is no triangular region, then we're done. - if ( n_iter_tri == 0 ) return; - - // Use round-robin assignment of micropanels to threads in the 2nd loop - // and the default (slab or rr) partitioning in the 1st loop for the - // remaining triangular region of C. - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - - // Advance the start and end iteration offsets for the triangular region - // by the number of iterations used for the rectangular region. - jr_start += n_iter_rct; - jr_end += n_iter_rct; - - // Loop over the n dimension (NR columns at a time). 
- for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) - { - const char* b1 = b_cast + j * cstep_b; - char* c1 = c_cast + j * cstep_c; - - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); - - // Initialize our next panel of B to be the current panel of B. - const char* b2 = b1; - - // Interior loop over the m dimension (MR rows at a time). - for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) - { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; - // Compute the diagonal offset for the submatrix at (i,j). - doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + // Compute the addresses of the next panel of A. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); - // Compute the addresses of the next panels of A and B. - const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) - { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } - - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); - - // If the diagonal intersects the current MR x NR submatrix, we - // compute it the temporary buffer and then add in the elements - // on or below the diagonal. - // Otherwise, if the submatrix is strictly below the diagonal, - // we compute and store as we normally would. - // And if we're strictly above the diagonal, we do nothing and - // continue. - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) - { // Invoke the gemm micro-kernel. 
gemm_ukr ( @@ -411,14 +308,35 @@ void bli_gemmt_l_ker_var2 ); // Scale C and add the result to only the stored part. - xpbys_mxn_l_ukr( diagoffc_ij, - m_cur, n_cur, - ct, rs_ct, cs_ct, - ( void* )beta_cast, - c11, rs_c, cs_c ); + xpbys_mxn_l_ukr + ( + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); } else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter_l( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + // Invoke the gemm micro-kernel. gemm_ukr ( diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c new file mode 100644 index 000000000..7c50a4a54 --- /dev/null +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c @@ -0,0 +1,387 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_l_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +BLIS_INLINE void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); + +static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); + + +void bli_gemmt_l_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. 
+ obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ]; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // If any dimension is zero, return immediately. 
+ if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of C is entirely above the diagonal, + // it is not stored. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; + + // If there is a zero region above where the diagonal of C intersects + // the left edge of the panel, adjust the pointer to C and A and treat + // this case as if the diagonal offset were zero. + // NOTE: It's possible that after this pruning that the diagonal offset + // is still negative (though its absolute value is guaranteed to be less + // than MR). + if ( diagoffc < 0 ) + { + const dim_t ip = -diagoffc / MR; + const dim_t i = ip * MR; + + m = m - i; + diagoffc = diagoffc % MR; + c_cast = c_cast + (i )*rs_c*dt_c_size; + a_cast = a_cast + (ip )*ps_a*dt_size; + } + + // If there is a zero region to the right of where the diagonal + // of C intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffc + m < n ) + { + n = diagoffc + m; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. 
Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + // Determine the starting microtile offsets and number of microtiles to + // compute for each thread. Note that assignment of microtiles is done + // according to the tlb policy. + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_l( jr_nt, jr_tid, diagoffc, m_iter, n_iter, MR, NR, + &jr_st, &ir_st ); + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Interior loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + // Compute the diagonal offset for the microtile at (i,j). 
+ const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; + + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // If the diagonal intersects the current MR x NR microtile, we + // compute it in the temporary buffer and then add in the elements + // on or below the diagonal. + // Otherwise, if the microtile is strictly below the diagonal, + // we compute and store as we normally would. + // And if we're strictly above the diagonal, we simply advance + // to the last microtile before the diagonal. + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panel of A. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Scale C and add the result to only the stored part. + xpbys_mxn_l_ukr + ( + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_tlb_l( i, m_iter ) ) + { + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else // if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Skip ahead to the last microtile strictly above the diagonal. + i = -diagoffc_j / MR - 1; + } + } + + // Upon reaching the end of the column of microtiles, get ready to begin + // at the beginning of the next column (i.e., the next jr loop iteration). 
+ i = 0; + } +} + diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 5b4e1ccd9..78d5b869d 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -48,7 +48,7 @@ typedef void (*xpbys_mxn_u_vft) #undef GENTFUNC #define GENTFUNC(ctype,ch,op) \ \ -void PASTEMAC(ch,op) \ +BLIS_INLINE void PASTEMAC(ch,op) \ ( \ doff_t diagoff, \ dim_t m, \ @@ -76,18 +76,19 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn); static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn); + void bli_gemmt_u_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { - const num_t dt = bli_obj_exec_dt( c ); - const dim_t dt_size = bli_dt_size( dt ); + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); doff_t diagoffc = bli_obj_diag_offset( c ); @@ -113,7 +114,7 @@ void bli_gemmt_u_ker_var2 const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. - obj_t scalar_a, scalar_b; + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); @@ -123,14 +124,17 @@ void bli_gemmt_u_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + // Alias some constants to simpler names. const dim_t MR = pd_a; const dim_t NR = pd_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. 
- gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); - xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt ]; + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ]; // Temporary C buffer for edge cases. Note that the strides of this // temporary buffer are set so that they match the storage of the @@ -138,11 +142,11 @@ void bli_gemmt_u_ker_var2 // column-stored as well. char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); - const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); const char* a_cast = buf_a; const char* b_cast = buf_b; char* c_cast = buf_c; @@ -177,12 +181,13 @@ void bli_gemmt_u_ker_var2 // is still positive (though it is guaranteed to be less than NR). if ( diagoffc > 0 ) { - dim_t jp = diagoffc / NR; - dim_t j = jp * NR; - n = n - j; - diagoffc = diagoffc % NR; - c_cast = c_cast + (j )*cs_c*dt_size; - b_cast = b_cast + (jp )*ps_b*dt_size; + const dim_t jp = diagoffc / NR; + const dim_t j = jp * NR; + + n = n - j; + diagoffc = diagoffc % NR; + c_cast = c_cast + (j )*cs_c*dt_c_size; + b_cast = b_cast + (jp )*ps_b*dt_size; } // If there is a zero region below where the diagonal of C intersects @@ -195,25 +200,23 @@ void bli_gemmt_u_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 
1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); @@ -221,9 +224,6 @@ void bli_gemmt_u_ker_var2 bli_auxinfo_set_is_a( is_a, &aux ); bli_auxinfo_set_is_b( is_b, &aux ); - // Save the desired output datatype (indicating no typecasting). - //bli_auxinfo_set_dt_on_output( dt, &aux );*/ - // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. @@ -231,47 +231,21 @@ void bli_gemmt_u_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. 
- dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); - - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - // Note that we partition the 2nd loop into two regions: the triangular - // part of C, and the rectangular portion. - dim_t n_iter_tri; - dim_t n_iter_rct; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) - { - // If the entire panel of C does not intersect the diagonal, there is - // no triangular region, and therefore we can skip the first set of - // loops. - n_iter_tri = 0; - n_iter_rct = n_iter; - } - else - { - // If the panel of C does intersect the diagonal, compute the number of - // iterations in the triangular (or trapezoidal) region by dividing NR - // into the number of rows in C. A non-zero remainder means we need to - // add one additional iteration. That is, we want the triangular region - // to contain as few columns of whole microtiles as possible while still - // including all microtiles that intersect the diagonal. The number of - // iterations in the rectangular region is computed as the remaining - // number of iterations in the n dimension. - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); - n_iter_rct = n_iter - n_iter_tri; - } - - // Use round-robin assignment of micropanels to threads in the 2nd loop - // and the default (slab or rr) partitioning in the 1st loop for the - // initial triangular region of C (if it exists). 
- bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + // Determine the thread range and increment for the 2nd and 1st loops. + // NOTE: The definition of bli_thread_range_slrr() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + bli_thread_range_quad( thread, diagoffc, BLIS_UPPER, m, n, NR, + FALSE, &jr_start, &jr_end, &jr_inc ); + //bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -279,7 +253,12 @@ void bli_gemmt_u_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -287,38 +266,41 @@ void bli_gemmt_u_ker_var2 // Interior loop over the m dimension (MR rows at a time). for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; - - // Compute the diagonal offset for the submatrix at (i,j). - doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + // Compute the diagonal offset for the microtile at (i,j). + const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? 
MR : m_left ); - // Compute the addresses of the next panels of A and B. - const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) - { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } - - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); - - // If the diagonal intersects the current MR x NR submatrix, we + // If the diagonal intersects the current MR x NR microtile, we // compute it the temporary buffer and then add in the elements // on or below the diagonal. - // Otherwise, if the submatrix is strictly above the diagonal, + // Otherwise, if the microtile is strictly above the diagonal, // we compute and store as we normally would. // And if we're strictly below the diagonal, we do nothing and - // continue. + // continue on through the IR loop to consider the next MR x NR + // microtile. if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter_u( diagoffc_ij, MR, NR, ir_inc ) ) + { + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + // Invoke the gemm micro-kernel. gemm_ukr ( @@ -335,93 +317,28 @@ void bli_gemmt_u_ker_var2 ); // Scale C and add the result to only the stored part. 
- xpbys_mxn_u_ukr( diagoffc_ij, - m_cur, n_cur, - ct, rs_ct, cs_ct, - ( void* )beta_cast, - c11, rs_c, cs_c ); - } - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) - { - // Invoke the gemm micro-kernel. - gemm_ukr + xpbys_mxn_u_ukr ( - m_cur, - n_cur, - k, - ( void* )alpha_cast, - ( void* )a1, - ( void* )b1, + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, ( void* )beta_cast, - c11, rs_c, cs_c, - &aux, - ( cntx_t* )cntx + c11, rs_c, cs_c ); } - } - } - - // If there is no rectangular region, then we're done. - if ( n_iter_rct == 0 ) return; - - // Determine the thread range and increment for the 2nd loop of the - // remaining rectangular region of C (and also use default partitioning - // for the 1st loop). - // NOTE: The definition of bli_thread_range_jrir() will depend on whether - // slab or round-robin partitioning was requested at configure-time. - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - - // Advance the start and end iteration offsets for the rectangular region - // by the number of iterations used for the triangular region. - jr_start += n_iter_tri; - jr_end += n_iter_tri; - - // Loop over the n dimension (NR columns at a time). - for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) - { - const char* b1 = b_cast + j * cstep_b; - char* c1 = c_cast + j * cstep_c; - - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); - - // Initialize our next panel of B to be the current panel of B. - const char* b2 = b1; - - // Interior loop over the m dimension (MR rows at a time). - for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) - { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; - - // No need to compute the diagonal offset for the rectangular - // region. - //diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ - - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); - - // Compute the addresses of the next panels of A and B. 
- const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); + // Compute the addresses of the next panel of A. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); - // If the diagonal intersects the current MR x NR submatrix, we - // compute it the temporary buffer and then add in the elements - // on or below the diagonal. - // Otherwise, if the submatrix is strictly above the diagonal, - // we compute and store as we normally would. - // And if we're strictly below the diagonal, we do nothing and - // continue. - { // Invoke the gemm micro-kernel. gemm_ukr ( diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c new file mode 100644 index 000000000..91275577a --- /dev/null +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c @@ -0,0 +1,386 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_u_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +BLIS_INLINE void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn); + +static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn); + + +void bli_gemmt_u_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. 
+ obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ]; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // If any dimension is zero, return immediately. 
+ if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of C is entirely below the diagonal, + // it is not stored. So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; + + // If there is a zero region to the left of where the diagonal of C + // intersects the top edge of the panel, adjust the pointer to C and B + // and treat this case as if the diagonal offset were zero. + // NOTE: It's possible that after this pruning that the diagonal offset + // is still positive (though it is guaranteed to be less than NR). + if ( diagoffc > 0 ) + { + const dim_t jp = diagoffc / NR; + const dim_t j = jp * NR; + + n = n - j; + diagoffc = diagoffc % NR; + c_cast = c_cast + (j )*cs_c*dt_c_size; + b_cast = b_cast + (jp )*ps_b*dt_size; + } + + // If there is a zero region below where the diagonal of C intersects + // the right edge of the panel, shrink it to prevent "no-op" iterations + // from executing. + if ( -diagoffc + n < m ) + { + m = -diagoffc + n; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. 
Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + // Determine the starting microtile offsets and number of microtiles to + // compute for each thread. Note that assignment of microtiles is done + // according to the tlb policy. + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_u( jr_nt, jr_tid, diagoffc, m_iter, n_iter, MR, NR, + &jr_st, &ir_st ); + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Interior loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + // Compute the diagonal offset for the microtile at (i,j). 
+ const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; + + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // If the diagonal intersects the current MR x NR microtile, we + // compute it in the temporary buffer and then add in the elements + // on or above the diagonal. + // Otherwise, if the microtile is strictly above the diagonal, + // we compute and store as we normally would. + // And if we're strictly below the diagonal, we simply advance + // to the last microtile before the bottom of the matrix. + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_tlb_u( diagoffc_ij, MR, NR ) ) + { + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Scale C and add the result to only the stored part. + xpbys_mxn_u_ukr + ( + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panel of A. 
+ const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else // if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Skip past the microtiles strictly below the diagonal. + i = m_iter - 1; + } + } + + // Upon reaching the end of the column of microtiles, get ready to begin + // at the beginning of the next column (i.e., the next jr loop iteration). + i = 0; + } +} + diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h index eb6e16018..339b93755 100644 --- a/frame/3/gemmt/bli_gemmt_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -43,46 +43,19 @@ \ void PASTEMAC0(opname) \ ( \ - const obj_t* a, \ - const obj_t* ah, \ - const obj_t* c, \ - const cntx_t* cntx, \ - const cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* ah, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ + thrinfo_t* thread_par \ ); GENPROT( gemmt_x_ker_var2 ) - GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 ) -INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 ) +GENPROT( gemmt_x_ker_var2b ) +GENPROT( gemmt_l_ker_var2b ) +GENPROT( gemmt_u_ker_var2b ) diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 207e1c938..8081537b9 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -42,12 +42,12 @@ static l3_var_oft vars[2] = void bli_gemmt_x_ker_var2 ( - const obj_t* a, - const obj_t* ah, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* ah, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { dim_t uplo; @@ -67,7 +67,7 @@ void bli_gemmt_x_ker_var2 c, cntx, cntl, - thread + thread_par ); } diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2b.c b/frame/3/gemmt/bli_gemmt_x_ker_var2b.c new file mode 100644 index 000000000..132d7c13a --- /dev/null +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2b.c @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +static l3_var_oft vars[2] = +{ + bli_gemmt_l_ker_var2b, bli_gemmt_u_ker_var2b, +}; + +void bli_gemmt_x_ker_var2b + ( + const obj_t* a, + const obj_t* ah, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + dim_t uplo; + + // Set an index (0 = lower, 1 = upper) based on the uplo field of C's root object. + if ( bli_obj_root_is_lower( c ) ) uplo = 0; + else uplo = 1; + + // Index into the variant array to extract the correct function pointer. + l3_var_oft f = vars[uplo]; + + // Call the selected macrokernel variant, forwarding every argument unchanged.
+ f + ( + a, + ah, + c, + cntx, + cntl, + thread_par + ); +} + diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev new file mode 100644 index 000000000..aed0359ec --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev @@ -0,0 +1,507 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); + + +void bli_gemmt_l_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C.
+ const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ?
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. NOTE(review): '-diagoffc % MR' below yields a non-negative offset; confirm that discarding the sign is intended. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions.
*/ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset.
Any remainder from this integer division is discarded, which + is what we want. That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the initial rectangular region of C (if it exists). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it in the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. In this rectangular region the microkernel is invoked unconditionally. */ \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B.
*/ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it in the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel.
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) + diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before b/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before new file mode 100644 index 000000000..4285bd135 --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before @@ -0,0 +1,427 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2b); + + +void bli_gemmt_l_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c =
bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix.
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still negative (though its absolute value is guaranteed to be less + than MR). */ \ + if ( diagoffc < 0 ) \ + { \ + const dim_t ip = -diagoffc / MR; \ + const dim_t i = ip * MR; \ +\ + m = m - i; \ + diagoffc = diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing.
*/ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the virtual microkernel address and the params. */ \ + /*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \ + /*bli_auxinfo_set_params( params, &aux );*/ \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + const dim_t jr_nt = bli_thread_n_way( thread ); \ + const dim_t jr_tid = bli_thread_work_id( thread ); \ + const dim_t ir_nt = bli_thread_n_way( caucus ); \ + const dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops.
+ NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. The jr range below comes from bli_thread_range_weighted_jr(); only the ir range uses bli_thread_range_jrir(). */ \ +/* +*/ \ + bli_thread_range_weighted_jr( thread, diagoffc, BLIS_LOWER, m, n, NR, \ + FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );*/ \ +/* +*/ \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +/* + dim_t jr_st, ir_st; \ + const dim_t n_ut_for_me \ + = \ + bli_thread_range_tlb( thread, diagoffc, BLIS_LOWER, m, n, MR, NR, \ + &jr_st, &ir_st ); \ +*/ \ +\ +/* +printf( "bli_gemmt_l_ker_var2b(): tid %d: m n = %d %d st en in = %3d %3d %3d do %d\n", (int)jr_tid, (int)m, (int)n, (int)jr_start, (int)jr_end, (int)jr_inc, (int)diagoffc ); \ +*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). */ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR microtile, we + compute it in the temporary buffer and then add in the elements + on or below the diagonal. 
+ Otherwise, if the microtile is strictly below the diagonal, + we compute and store as we normally would.
+ And if we're strictly above the diagonal, we do nothing and + continue on through the IR loop to consider the next MR x NR + microtile. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object.
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2b ) + diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev new file mode 100644 index 000000000..87d77ee55 --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev @@ -0,0 +1,510 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); + +/* Object-based wrapper around the typed upper gemmt macrokernel that is + generated further down via GENTFUNC. It reads the diagonal offset, the + dimensions and the strides from a, b and c (k is the width of a), merges + the scalars attached to a and b into alpha, takes beta from the scalar + attached to c, and then calls the entry of ftypes[] selected by the + execution datatype of c. The cntl argument is not referenced here. */ +void bli_gemmt_u_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c =
bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. 
For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); /* NOTE: the ct strides below are chosen from this flag, not from rs_c/cs_c: ct is column-stored when it is true and row-stored when it is false. */ \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR).
*/ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the sum of m and diagoffc (the rows of C shifted by the diagonal + offset) and rounding up. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + initial triangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time).
*/ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it into the temporary buffer and then add in the elements + on or above the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part.
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Determine the thread range and increment for the 2nd loop of the + remaining rectangular region of C (and also use default partitioning + for the 1st loop). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* This loop covers the rectangular region. The triangular region + handled by the earlier loop is sized to include every microtile + that intersects the diagonal. + So no diagonal test and no temporary buffer are used here: + each microtile is computed by the microkernel with its actual + m_cur x n_cur dimensions and written directly to C, scaled by + beta. */ \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) + diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before b/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before new file mode 100644 index 000000000..dbf8f389f --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before @@ -0,0 +1,415 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2b); + +/* Object-based wrapper around the typed upper gemmt macrokernel (variant 2b) + that is generated further down via GENTFUNC. It reads the diagonal offset, + the dimensions and the strides from a, b and c (k is the width of a), + merges the scalars attached to a and b into alpha, takes beta from the + scalar attached to c, and then calls the entry of ftypes[] selected by the + execution datatype of c. The cntl argument is not referenced here. */ +void bli_gemmt_u_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C.
+ const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer, used for microtiles that intersect the diagonal. + Note that its strides are not taken from rs_c/cs_c; they follow the + col_pref flag queried below. If col_pref is true, ct is column-stored + (rs_ct = 1, cs_ct = MR); otherwise it is row-stored (rs_ct = NR, cs_ct = 1). */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ?
MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ + if ( diagoffc > 0 ) \ + { \ + const dim_t jp = diagoffc / NR; \ + const dim_t j = jp * NR; \ +\ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 
1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the virtual microkernel address and the params. */ \ + /*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \ + /*bli_auxinfo_set_params( params, &aux );*/ \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. 
*/ \ + bli_thread_range_weighted_jr( thread, diagoffc, BLIS_UPPER, m, n, NR, \ + FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );*/ \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ +/* +printf( "bli_gemmt_u_ker_var2b(): tid %d: m n = %d %d st en in = %3d %3d %3d do %d\n", (int)jr_tid, (int)m, (int)n, (int)jr_start, (int)jr_end, (int)jr_inc, (int)diagoffc ); \ +*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). */ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR microtile, we + compute it into the temporary buffer and then add in the elements + on or above the diagonal. + Otherwise, if the microtile is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue on through the IR loop to consider the next MR x NR + microtile. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B.
*/ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2b ) + diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 3bc4e3c6b..0c5cde72c 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_ll_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -83,10 +83,10 @@ void bli_trmm_ll_ker_var2 const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Alias some constants to simpler names. - const dim_t MR = pd_a; - const dim_t NR = pd_b; - const dim_t PACKMR = cs_a; - const dim_t PACKNR = rs_b; + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. @@ -140,50 +140,45 @@ void bli_trmm_ll_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. 
- inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. - //thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread ); + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - //dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); - //dim_t ir_tid = bli_thrinfo_work_id( ir_thread ); + //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - //dim_t ir_start, ir_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. 
- bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - //bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -191,20 +186,24 @@ void bli_trmm_ll_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; + // Initialize pointers for stepping through the block of A and current + // column of microtiles of C. const char* a1 = a_cast; char* c11 = c1; // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, scale C // by beta. If it is strictly below the diagonal, scale by one. @@ -215,8 +214,8 @@ void bli_trmm_ll_ker_var2 // Determine the offset to and length of the panel that was // packed so we can index into the corresponding location in // b1. - dim_t off_a1011 = 0; - dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); + const dim_t off_a1011 = 0; + const dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); // Compute the panel stride for the current diagonal- // intersecting micro-panel. @@ -230,13 +229,13 @@ void bli_trmm_ll_ker_var2 const char* b1_i = b1 + off_a1011 * PACKNR * dt_size; // Compute the addresses of the next panels of A and B. 
- const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -268,13 +267,13 @@ void bli_trmm_ll_ker_var2 //if ( bli_trmm_my_iter( i, ir_thread ) ) { // Compute the addresses of the next panels of A and B. - const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -306,6 +305,6 @@ void bli_trmm_ll_ker_var2 } } -//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" ); -//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2b.c b/frame/3/trmm/bli_trmm_ll_ker_var2b.c new file mode 100644 index 000000000..bb6de00f5 --- /dev/null +++ b/frame/3/trmm/bli_trmm_ll_ker_var2b.c @@ -0,0 +1,365 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_trmm_ll_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. 
+ gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current block of A is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; + + // If there is a zero region above where the diagonal of A intersects the + // left edge of the block, adjust the pointer to C and treat this case as + // if the diagonal offset were zero. This skips over the region that was + // not packed. (Note we assume the diagonal offset is a multiple of MR; + // this assumption will hold as long as the cache blocksizes KC and MC are + // each a multiple of MR.) + if ( diagoffa < 0 ) + { + m += diagoffa; + c_cast -= diagoffa * rs_c * dt_size; + diagoffa = 0; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Computing the number of MR x MR tiles in the k dimension is needed + // when computing the thread ranges below.
+ const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 ); + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for the JR loop. + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_trmm_ll( jr_nt, jr_tid, diagoffa, m_iter, n_iter, k_iter, + MR, NR, &jr_st, &ir_st ); + +#if 0 + printf( "tid: %ld m,n,k_iter: %ld %ld %ld\n", tid, m_iter, n_iter, k_iter ); + printf( "tid: %ld trmm_ll_tlb begins at: %ld %ld (n_ut: %ld)\n", + tid, jr_st, ir_st, n_ut_for_me ); +#endif + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb_trmm_ll(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. 
+ dim_t ut = 0; + + const char* a1 = a_cast; + + // Get pointers into position by stepping through to the ith micropanel of + // A and ith microtile of C (within the appropriate column of microtiles). + for ( dim_t ii = 0; ii < ir_st; ++ii ) + { + const doff_t diagoffa_ii = diagoffa + ( doff_t )ii*MR; + + if ( bli_intersects_diag_n( diagoffa_ii, MR, k ) ) + { + // Determine the length of the panel that was packed. + const dim_t k_a1011 = bli_min( diagoffa_ii + MR, k ); + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_ii, MR, k ) ) + { + a1 += rstep_a; + } + } + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + char* c11 = c1 + i * rstep_c; + + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // If the current panel of A intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Determine the offset to and length of the panel that was + // packed so we can index into the corresponding location in B. + const dim_t off_a1011 = 0; + const dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. 
+ inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + const char* b1_i = b1 + off_a1011 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_a1011, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) + { + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += rstep_a; + } + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + // Reset the a1 pointer to the beginning of the packed matrix A. 
+ a1 = a_cast; + } +} + +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2b: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2b: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" ); + +//printf( "tid: %ld intersects diag. j,i: %ld %ld (ut: %ld)\n", tid, j, i, ut ); +//printf( "tid: %ld strictbelow diag j,i: %ld %ld (ut: %ld)\n", tid, j, i, ut ); + +//printf( "tid: %ld incrementing by ps_a_cur: %ld (k_a1011: %ld)\n", +// tid, ps_a_cur, k_a1011 ); +//printf( "tid: %ld incrementing by rstep_a: %ld (k : %ld)\n", +// tid, rstep_a, k ); + diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 265e21a66..039bcc292 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_lu_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -83,10 +83,10 @@ void bli_trmm_lu_ker_var2 const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Alias some constants to simpler names. - const dim_t MR = pd_a; - const dim_t NR = pd_b; - const dim_t PACKMR = cs_a; - const dim_t PACKNR = rs_b; + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. @@ -147,50 +147,45 @@ void bli_trmm_lu_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 
1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. - //thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread ); + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - //dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); - //dim_t ir_tid = bli_thrinfo_work_id( ir_thread ); + //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - //dim_t ir_start, ir_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. 
- bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - //bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -198,20 +193,24 @@ void bli_trmm_lu_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; + // Initialize pointers for stepping through the block of A and current + // column of microtiles of C. const char* a1 = a_cast; char* c11 = c1; // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, scale C // by beta. If it is strictly above the diagonal, scale by one. @@ -222,8 +221,8 @@ void bli_trmm_lu_ker_var2 // Determine the offset to and length of the panel that was // packed so we can index into the corresponding location in // b1. - dim_t off_a1112 = diagoffa_i; - dim_t k_a1112 = k - off_a1112; + const dim_t off_a1112 = diagoffa_i; + const dim_t k_a1112 = k - off_a1112; // Compute the panel stride for the current diagonal- // intersecting micro-panel. @@ -237,13 +236,13 @@ void bli_trmm_lu_ker_var2 const char* b1_i = b1 + off_a1112 * PACKNR * dt_size; // Compute the addresses of the next panels of A and B. 
- const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -275,13 +274,13 @@ void bli_trmm_lu_ker_var2 //if ( bli_trmm_my_iter( i, ir_thread ) ) { // Compute the addresses of the next panels of A and B. - const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -313,6 +312,6 @@ void bli_trmm_lu_ker_var2 } } -//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" ); -//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2b.c b/frame/3/trmm/bli_trmm_lu_ker_var2b.c new file mode 100644 index 000000000..39640ad6b --- /dev/null +++ b/frame/3/trmm/bli_trmm_lu_ker_var2b.c @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_trmm_lu_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. 
+ gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current block of A is entirely below the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; + + // If there is a zero region to the left of where the diagonal of A + // intersects the top edge of the block, adjust the pointer to B and + // treat this case as if the diagonal offset were zero. Note that we + // don't need to adjust the pointer to A since packm would have simply + // skipped over the region that was not stored. (Note we assume the + // diagonal offset is a multiple of MR; this assumption will hold as + // long as the cache blocksizes KC and MC are each a multiple of MR.) + if ( diagoffa > 0 ) + { + k -= diagoffa; + b_cast += diagoffa * PACKNR * dt_size; + diagoffa = 0; + } + + // If there is a zero region below where the diagonal of A intersects the + // right side of the block, shrink it to prevent "no-op" iterations from + // executing. + if ( -diagoffa + k < m ) + { + m = -diagoffa + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions.
+ const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Computing the number of MR x MR tiles in the k dimension is needed + // when computing the thread ranges below. + const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 ); + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_trmm_lu( jr_nt, jr_tid, diagoffa, m_iter, n_iter, k_iter, + MR, NR, &jr_st, &ir_st ); + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb_trmm_lu(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread.
+ dim_t ut = 0; + + const char* a1 = a_cast; + + // Get pointers into position by stepping through to the ith micropanel of + // A and ith microtile of C (within the appropriate column of microtiles). + for ( dim_t ii = 0; ii < ir_st; ++ii ) + { + const doff_t diagoffa_ii = diagoffa + ( doff_t )ii*MR; + + if ( bli_intersects_diag_n( diagoffa_ii, MR, k ) ) + { + // Determine the length of the panel that was packed. + const dim_t k_a1112 = k - diagoffa_ii; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1112 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffa_ii, MR, k ) ) + { + a1 += rstep_a; + } + } + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + char* c11 = c1 + i * rstep_c; + + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // If the current panel of A intersects the diagonal, scale C + // by beta. If it is strictly above the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Determine the offset to and length of the panel that was + // packed so we can index into the corresponding location in B. + const dim_t off_a1112 = diagoffa_i; + const dim_t k_a1112 = k - off_a1112; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. 
+ inc_t ps_a_cur = k_a1112 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + const char* b1_i = b1 + off_a1112 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_a1112, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) + { + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += rstep_a; + } + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + // Reset the a1 pointer to the beginning of the packed matrix A. 
+ a1 = a_cast; + } +} + +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" ); + +#if 0 + printf( "tid: %ld m,n,k_iter: %ld %ld %ld\n", tid, m_iter, n_iter, k_iter ); + printf( "tid: %ld trmm_lu_tlb begins at: %ld %ld (n_ut: %ld)\n", + tid, jr_st, ir_st, n_ut_for_me ); +#endif + diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 785f2cf5f..f8d0fc6c8 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_rl_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -147,39 +147,40 @@ void bli_trmm_rl_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. 
bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); + // Query the number of threads and thread ids for each loop. + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; // Note that we partition the 2nd loop into two regions: the rectangular // part of B, and the triangular portion. @@ -207,11 +208,11 @@ void bli_trmm_rl_ker_var2 // Determine the thread range and increment for the 2nd and 1st loops for // the initial rectangular region of B (if it exists). - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). 
for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -219,7 +220,7 @@ void bli_trmm_rl_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -231,15 +232,15 @@ void bli_trmm_rl_ker_var2 const char* a1 = a_cast + i * rstep_a; char* c11 = c1 + i * rstep_c; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); // Compute the addresses of the next panels of A and B. const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + if ( bli_is_last_iter_slrr( i, m_iter, ir_tid, ir_nt ) ) { a2 = a_cast; b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -271,7 +272,7 @@ void bli_trmm_rl_ker_var2 // Use round-robin assignment of micropanels to threads in the 2nd and // 1st loops for the remaining triangular region of B (if it exists). - // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // NOTE: We don't need to call bli_thread_range_rr() here since we // employ a hack that calls for each thread to execute every iteration // of the jr and ir loops but skip all but the pointer increment for // iterations that are not assigned to it. @@ -285,18 +286,18 @@ void bli_trmm_rl_ker_var2 // Loop over the n dimension (NR columns at a time). 
for ( dim_t j = jr_start; j < n_iter; ++j ) { - doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; // Determine the offset to the beginning of the panel that // was packed so we can index into the corresponding location // in A. Then compute the length of that panel. - dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); - dim_t k_b1121 = k - off_b1121; + const dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + const dim_t k_b1121 = k - off_b1121; const char* a1 = a_cast; char* c11 = c1; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -319,7 +320,7 @@ void bli_trmm_rl_ker_var2 { if ( bli_trmm_my_iter_rr( i, caucus ) ) { - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2b.c b/frame/3/trmm/bli_trmm_rl_ker_var2b.c new file mode 100644 index 000000000..7f2757c3a --- /dev/null +++ b/frame/3/trmm/bli_trmm_rl_ker_var2b.c @@ -0,0 +1,392 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_rl_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t 
cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely above the diagonal, + // it is implicitly zero. So we do nothing. 
+ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. (Note we assume the diagonal offset + // is a multiple of NR; this assumption will hold as long as the cache + // blocksizes KC and NC are each a multiple of NR.) + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Computing the number of NR x NR tiles in the k dimension is needed + // when computing the thread ranges below. + const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 ); + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel while the 'caucus' points to the thrinfo_t + // node for the 1st loop (ir). 
+ thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. +#if 0 +{ + const dim_t jr_nt = 17; + const dim_t jr_tid = jr_nt - 1; + + const doff_t m_iter = 10; + const doff_t k_iter = 10; + const doff_t n_iter = 20; + + diagoffb = 30 * NR; +#else + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); +#endif + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_trmm_rl( jr_nt, jr_tid, diagoffb, m_iter, n_iter, k_iter, + MR, NR, &jr_st, &ir_st ); + +#if 0 + printf( "tid %ld: final range: jr_st, ir_st: %ld %ld (n_ut_for_me: %ld)\n", + jr_tid, jr_st, ir_st, n_ut_for_me ); + return; +} +const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st; +#endif + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb_trmm_rl(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + const char* b1 = b_cast; + + // Get pointers into position by stepping through to the jth micropanel of + // B and jth microtile of C (within the appropriate row of microtiles). + for ( dim_t jj = 0; jj < jr_st; ++jj ) + { + const doff_t diagoffb_jj = diagoffb - ( doff_t )jj*NR; + + if ( bli_intersects_diag_n( diagoffb_jj, k, NR ) ) + { + // Determine the length of the panel that was packed. 
+ const dim_t off_b1121 = bli_max( -diagoffb_jj, 0 ); + const dim_t k_b1121 = k - off_b1121; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_jj, k, NR ) ) + { + b1 += cstep_b; + } + } + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + char* c1 = c_cast + j * cstep_c; + + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. + const dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + const dim_t k_b1121 = k - off_b1121; + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, ps_b_cur, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) + { + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. 
+ ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + b1 += cstep_b; + } + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index ca27caef1..a031b6794 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_ru_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -148,25 +148,23 @@ void bli_trmm_ru_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. 
bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); @@ -177,14 +175,13 @@ void bli_trmm_ru_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); + //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; // Note that we partition the 2nd loop into two regions: the triangular // part of C, and the rectangular portion. @@ -212,7 +209,7 @@ void bli_trmm_ru_ker_var2 // Use round-robin assignment of micropanels to threads in the 2nd and // 1st loops for the initial triangular region of B (if it exists). - // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // NOTE: We don't need to call bli_thread_range_rr() here since we // employ a hack that calls for each thread to execute every iteration // of the jr and ir loops but skip all but the pointer increment for // iterations that are not assigned to it. @@ -223,17 +220,18 @@ void bli_trmm_ru_ker_var2 // Loop over the n dimension (NR columns at a time). for ( dim_t j = 0; j < n_iter_tri; ++j ) { - doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; // Determine the offset to and length of the panel that was packed // so we can index into the corresponding location in A. 
- dim_t off_b0111 = 0; - dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + const dim_t off_b0111 = 0; + const dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); const char* a1 = a_cast; char* c11 = c1; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -256,7 +254,8 @@ void bli_trmm_ru_ker_var2 { if ( bli_trmm_my_iter_rr( i, caucus ) ) { - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); const char* a1_i = a1 + off_b0111 * PACKMR * dt_size; @@ -266,8 +265,6 @@ void bli_trmm_ru_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -307,11 +304,11 @@ void bli_trmm_ru_ker_var2 // Determine the thread range and increment for the 2nd and 1st loops for // the remaining rectangular region of B. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Advance the start and end iteration offsets for the rectangular region // by the number of iterations used for the triangular region. 
@@ -332,7 +329,8 @@ void bli_trmm_ru_ker_var2 b1 = b_cast + (j-jb0) * cstep_b; c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -348,16 +346,15 @@ void bli_trmm_ru_ker_var2 const char* a1 = a_cast + i * rstep_a; char* c11 = c1 + i * rstep_c; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // Compute the addresses of the next panels of A and B. const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + if ( bli_is_last_iter_slrr( i, m_iter, ir_tid, ir_nt ) ) { a2 = a_cast; b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2b.c b/frame/3/trmm/bli_trmm_ru_ker_var2b.c new file mode 100644 index 000000000..8aae2386a --- /dev/null +++ b/frame/3/trmm/bli_trmm_ru_ker_var2b.c @@ -0,0 +1,390 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_ru_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t 
cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely below its diagonal, + // it is implicitly zero. So we do nothing. 
+ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region to the left of where the diagonal of B + // intersects the top edge of the panel, adjust the pointer to C and + // treat this case as if the diagonal offset were zero. This skips over + // the region that was not packed. (Note we assume the diagonal offset + // is a multiple of NR; this assumption will hold as long as the cache + // blocksizes KC and NC are each a multiple of NR.) + if ( diagoffb > 0 ) + { + n -= diagoffb; + c_cast += diagoffb * cs_c * dt_size; + diagoffb = 0; + } + + // If there is a zero region below where the diagonal of B intersects the + // right side of the block, shrink it to prevent "no-op" iterations from + // executing. + if ( -diagoffb + n < k ) + { + k = -diagoffb + n; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Computing the number of NR x NR tiles in the k dimension is needed + // when computing the thread ranges below. + const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 ); + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread_par' argument points to the parent of the thrinfo_t node + // for the 2nd (jr) loop around the microkernel. Here we query that jr + // node; the query for the 1st (ir) loop node is disabled in this variant.
+ thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. +#if 0 +{ + const dim_t jr_nt = 1; + const dim_t jr_tid = 0; //jr_nt - 1; + + const doff_t m_iter = 10; + const doff_t k_iter = 10; + const doff_t n_iter = 20; + + diagoffb = 0 * NR; +#else + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); +#endif + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_trmm_ru( jr_nt, jr_tid, diagoffb, m_iter, n_iter, k_iter, + MR, NR, &jr_st, &ir_st ); + +#if 0 + printf( "tid %ld: final range: jr_st, ir_st: %ld %ld (n_ut_for_me: %ld)\n", + jr_tid, jr_st, ir_st, n_ut_for_me ); + return; +} +const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st; +#endif + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads get no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb_trmm_ru(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + const char* b1 = b_cast; + + // Get pointers into position by stepping through to the jth micropanel of + // B and jth microtile of C (within the appropriate row of microtiles). + for ( dim_t jj = 0; jj < jr_st; ++jj ) + { + const doff_t diagoffb_jj = diagoffb - ( doff_t )jj*NR; + + if ( bli_intersects_diag_n( diagoffb_jj, k, NR ) ) + { + // Determine the length of the panel that was packed. + const dim_t k_b0111 = bli_min( k, -diagoffb_jj + NR ); + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel.
+ inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffb_jj, k, NR ) ) + { + b1 += cstep_b; + } + } + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + char* c1 = c_cast + j * cstep_c; + + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. + const dim_t off_b0111 = 0; + const dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly above the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + const char* a1_i = a1 + off_b0111 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, ps_b_cur, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. 
+ bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b0111, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) + { + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). 
+ i = 0; + + b1 += cstep_b; + } + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index f8c3d7ee2..0a605ba86 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -43,54 +43,23 @@ \ void PASTEMAC0(opname) \ ( \ - const obj_t* a, \ - const obj_t* b, \ - const obj_t* c, \ - const cntx_t* cntx, \ - const cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ + thrinfo_t* thread_par \ ); -//GENPROT( trmm_blk_var1 ) -//GENPROT( trmm_blk_var2 ) -//GENPROT( trmm_blk_var3 ) - GENPROT( trmm_xx_ker_var2 ) - GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoff, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) +GENPROT( trmm_xx_ker_var2b ) +GENPROT( trmm_ll_ker_var2b ) +GENPROT( trmm_lu_ker_var2b ) +GENPROT( trmm_rl_ker_var2b ) +GENPROT( trmm_ru_ker_var2b ) diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index 60030bf4a..918b8f973 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -43,12 +43,12 @@ static l3_var_oft vars[2][2] = void bli_trmm_xx_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { dim_t side; @@ -81,7 +81,7 @@ void bli_trmm_xx_ker_var2 c, cntx, cntl, - thread + thread_par ); } diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2b.c b/frame/3/trmm/bli_trmm_xx_ker_var2b.c new file mode 100644 index 000000000..57894165c --- /dev/null +++ b/frame/3/trmm/bli_trmm_xx_ker_var2b.c @@ -0,0 +1,87 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +static l3_var_oft vars[2][2] = +{ + { bli_trmm_ll_ker_var2b, bli_trmm_lu_ker_var2b }, + { bli_trmm_rl_ker_var2b, bli_trmm_ru_ker_var2b } +}; + +void bli_trmm_xx_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + dim_t side; + dim_t uplo; + + // Set two array indices: one based on the implied side parameter (the + // structure of the root object) and one based on the uplo field of the + // triangular matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( b ) ) uplo = 0; + else uplo = 1; + } + + // Index into the variant array to extract the correct function pointer. + l3_var_oft f = vars[side][uplo]; + + // Call the macrokernel. + f + ( + a, + b, + c, + cntx, + cntl, + thread_par + ); +} + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev new file mode 100644 index 000000000..5aebe23c1 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev @@ -0,0 +1,371 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_rl_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t 
cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely above the diagonal, + // it is implicitly zero. So we do nothing. 
+ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + dim_t ir_nt = bli_thrinfo_n_way( caucus ); + dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + + // Note that we partition the 2nd loop into two regions: the rectangular + // part of B, and the triangular portion. 
+ dim_t n_iter_rct; + dim_t n_iter_tri; + + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) + { + // If the entire panel of B does not intersect the diagonal, there is + // no triangular region, and therefore we can skip the second set of + // loops. + n_iter_rct = n_iter; + n_iter_tri = 0; + } + else + { + // If the panel of B does intersect the diagonal, compute the number of + // iterations in the rectangular region by dividing NR into the diagonal + // offset. (There should never be any remainder in this division.) The + // number of iterations in the triangular (or trapezoidal) region is + // computed as the remaining number of iterations in the n dimension. + n_iter_rct = diagoffb / NR; + n_iter_tri = n_iter - n_iter_rct; + } + + // Determine the thread range and increment for the 2nd and 1st loops for + // the initial rectangular region of B (if it exists). + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + { + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + // If there is no triangular region, then we're done. + if ( n_iter_tri == 0 ) return; + + // Use round-robin assignment of micropanels to threads in the 2nd and + // 1st loops for the remaining triangular region of B (if it exists). + // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // employ a hack that calls for each thread to execute every iteration + // of the jr and ir loops but skip all but the pointer increment for + // iterations that are not assigned to it. + + // Advance the starting b1 and c1 pointers to the positions corresponding + // to the start of the triangular region of B. + jr_start = n_iter_rct; + const char* b1 = b_cast + jr_start * cstep_b; + char* c1 = c_cast + jr_start * cstep_c; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < n_iter; ++j ) + { + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + // Determine the offset to the beginning of the panel that + // was packed so we can index into the corresponding location + // in A. Then compute the length of that panel. + dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + dim_t k_b1121 = k - off_b1121; + + const char* a1 = a_cast; + char* c11 = c1; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. 
+ const char* b2 = b1; + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + if ( bli_trmm_my_iter_rr( j, thread ) ) { + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trmm_my_iter_rr( i, caucus ) ) { + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + + a1 += rstep_a; + c11 += rstep_c; + } + } + + b1 += ps_b_cur; + } + + c1 += cstep_c; + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified new file mode 100644 index 000000000..7d2aabaa4 --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified @@ -0,0 +1,324 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_trmm_rl_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. 
+ gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 
1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel while the 'caucus' points to the thrinfo_t + // node for the 1st loop (ir). + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. + //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; + + // Determine the thread range and increment for the 2nd and 1st loops. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + const char* b1 = b_cast; + // char* c1 = c_cast; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* a1 = a_cast; + char* c1 = c_cast + j * cstep_c; + char* c11 = c1; + + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? 
NR : n_left ); + + // Determine the offset to the beginning of the panel that + // was packed so we can index into the corresponding location + // in A. Then compute the length of that panel. + const dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + const dim_t k_b1121 = k - off_b1121; + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + //for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. 
+ gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) + { + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += cstep_b; + } + + //c1 += cstep_c; + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c index 275d6ca47..45af76910 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -356,7 +356,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C - by beta. If it is strictly below the diagonal, scale by one. + by beta. If it is strictly above the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. 
*/ \ if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index e2128f100..786e4f343 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_ll_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -158,47 +158,44 @@ void bli_trsm_ll_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // We don't bother querying the thrinfo_t node for the 1st loop because // we can't parallelize that loop in trsm due to the inter-iteration // dependencies that exist. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. 
- thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t jr_start, jr_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is unattainable due to the // inter-iteration dependencies present in trsm. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -206,7 +203,8 @@ void bli_trsm_ll_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -217,9 +215,10 @@ void bli_trsm_ll_ker_var2 // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, use a // special micro-kernel that performs a fused gemm and trsm. 
@@ -230,10 +229,10 @@ void bli_trsm_ll_ker_var2 if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) { // Compute various offsets into and lengths of parts of A. - dim_t off_a10 = 0; - dim_t k_a1011 = diagoffa_i + MR; - dim_t k_a10 = k_a1011 - MR; - dim_t off_a11 = k_a10; + const dim_t off_a10 = 0; + const dim_t k_a1011 = diagoffa_i + MR; + const dim_t k_a10 = k_a1011 - MR; + const dim_t off_a11 = k_a10; // Compute the panel stride for the current diagonal- // intersecting micro-panel. @@ -258,7 +257,7 @@ void bli_trsm_ll_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -292,7 +291,7 @@ void bli_trsm_ll_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 314ee3070..ebf44905b 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_lu_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -169,47 +169,44 @@ void bli_trsm_lu_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. 
- inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // We don't bother querying the thrinfo_t node for the 1st loop because // we can't parallelize that loop in trsm due to the inter-iteration // dependencies that exist. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t jr_start, jr_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is unattainable due to the // inter-iteration dependencies present in trsm. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). 
for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -217,7 +214,8 @@ void bli_trsm_lu_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -228,10 +226,11 @@ void bli_trsm_lu_ker_var2 // Loop over the m dimension (MR rows at a time). for ( dim_t ib = 0; ib < m_iter; ++ib ) { - dim_t i = m_iter - 1 - ib; - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const dim_t i = m_iter - 1 - ib; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, use a // special micro-kernel that performs a fused gemm and trsm. @@ -242,11 +241,11 @@ void bli_trsm_lu_ker_var2 if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) { // Compute various offsets into and lengths of parts of A. - dim_t off_a11 = diagoffa_i; - dim_t k_a1112 = k - off_a11;; - dim_t k_a11 = MR; - dim_t k_a12 = k_a1112 - MR; - dim_t off_a12 = off_a11 + k_a11; + const dim_t off_a11 = diagoffa_i; + const dim_t k_a1112 = k - off_a11;; + const dim_t k_a11 = MR; + const dim_t k_a12 = k_a1112 - MR; + const dim_t off_a12 = off_a11 + k_a11; // Compute the panel stride for the current diagonal- // intersecting micro-panel. 
@@ -271,7 +270,7 @@ void bli_trsm_lu_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -305,7 +304,7 @@ void bli_trsm_lu_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 42e72840e..073fe3ec0 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_rl_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -131,23 +131,23 @@ void bli_trsm_rl_ker_var2 the right-hand side parameter case). */ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); - /* If any dimension is zero, return immediately. */ + // If any dimension is zero, return immediately. if ( bli_zero_dim3( m, n, k ) ) return; - /* Safeguard: If the current panel of B is entirely above its diagonal, - it is implicitly zero. So we do nothing. */ + // Safeguard: If the current panel of B is entirely above its diagonal, + // it is implicitly zero. So we do nothing. if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; - /* If there is a zero region above where the diagonal of B intersects - the left edge of the panel, adjust the pointer to A and treat this - case as if the diagonal offset were zero. 
Note that we don't need to - adjust the pointer to B since packm would have simply skipped over - the region that was not stored. */ + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. if ( diagoffb < 0 ) { k += diagoffb; @@ -155,40 +155,40 @@ void bli_trsm_rl_ker_var2 diagoffb = 0; } - /* If there is a zero region to the right of where the diagonal - of B intersects the bottom of the panel, shrink it so that - we can index to the correct place in C (corresponding to the - part of the panel of B that was packed). - NOTE: This is NOT being done to skip over "no-op" iterations, - as with the trsm_lu macro-kernel. This MUST be done for correct - execution because we use n (via n_iter) to compute diagonal and - index offsets for backwards movement through B. */ + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it so that + // we can index to the correct place in C (corresponding to the + // part of the panel of B that was packed). + // NOTE: This is NOT being done to skip over "no-op" iterations, + // as with the trsm_lu macro-kernel. This MUST be done for correct + // execution because we use n (via n_iter) to compute diagonal and + // index offsets for backwards movement through B. if ( diagoffb + k < n ) { n = diagoffb + k; } - /* Check the k dimension, which needs to be a multiple of NR. If k - isn't a multiple of NR, we adjust it higher to satisfy the micro- - kernel, which is expecting to perform an NR x NR triangular solve. 
- This adjustment of k is consistent with what happened when B was - packed: all of its bottom/right edges were zero-padded, and - furthermore, the panel that stores the bottom-right corner of the - matrix has its diagonal extended into the zero-padded region (as - identity). This allows the trsm of that bottom-right panel to - proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of A. */ + // Check the k dimension, which needs to be a multiple of NR. If k + // isn't a multiple of NR, we adjust it higher to satisfy the micro- + // kernel, which is expecting to perform an NR x NR triangular solve. + // This adjustment of k is consistent with what happened when B was + // packed: all of its bottom/right edges were zero-padded, and + // furthermore, the panel that stores the bottom-right corner of the + // matrix has its diagonal extended into the zero-padded region (as + // identity). This allows the trsm of that bottom-right panel to + // proceed without producing any infs or NaNs that would infect the + // "good" values of the corresponding block of A. if ( k % NR != 0 ) k += NR - ( k % NR ); - /* NOTE: We don't need to check that n is a multiple of PACKNR since we - know that the underlying buffer was already allocated to have an n - dimension that is a multiple of PACKNR, with the region between the - last column and the next multiple of NR zero-padded accordingly. */ + // NOTE: We don't need to check that n is a multiple of PACKNR since we + // know that the underlying buffer was already allocated to have an n + // dimension that is a multiple of PACKNR, with the region between the + // last column and the next multiple of NR zero-padded accordingly. thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - /* Compute number of primary and leftover components of the m and n - dimensions. */ + // Compute number of primary and leftover components of the m and n + // dimensions. 
dim_t n_iter = n / NR; dim_t n_left = n % NR; @@ -198,7 +198,7 @@ void bli_trsm_rl_ker_var2 if ( n_left ) ++n_iter; if ( m_left ) ++m_iter; - /* Determine some increments used to step through A, B, and C. */ + // Determine some increments used to step through A, B, and C. inc_t rstep_a = ps_a * dt_size; inc_t cstep_b = ps_b * dt_size; @@ -206,17 +206,18 @@ void bli_trsm_rl_ker_var2 inc_t rstep_c = rs_c * MR * dt_size; inc_t cstep_c = cs_c * NR * dt_size; - /* Save the pack schemas of A and B to the auxinfo_t object. - NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + // NOTE: We swap the values for A and B since the triangular + // "A" matrix is actually contained within B. bli_auxinfo_set_schema_a( schema_b, &aux ); bli_auxinfo_set_schema_b( schema_a, &aux ); const char* b1 = b_cast; char* c1 = c_cast; - /* Loop over the n dimension (NR columns at a time). */ + // Loop over the n dimension (NR columns at a time). for ( dim_t jb = 0; jb < n_iter; ++jb ) { dim_t j = n_iter - 1 - jb; @@ -227,50 +228,50 @@ void bli_trsm_rl_ker_var2 const char* a1 = a_cast; char* c11 = c1 + (n_iter-1)*cstep_c; - /* Initialize our next panel of B to be the current panel of B. */ + // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; - /* If the current panel of B intersects the diagonal, use a - special micro-kernel that performs a fused gemm and trsm. - If the current panel of B resides below the diagonal, use a - a regular gemm micro-kernel. Otherwise, if it is above the - diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ + // If the current panel of B intersects the diagonal, use a + // special micro-kernel that performs a fused gemm and trsm. + // If the current panel of B resides below the diagonal, use a + // a regular gemm micro-kernel. 
Otherwise, if it is above the + // diagonal, it was not packed (because it is implicitly zero) + // and so we do nothing. if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) { - /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. dim_t off_b11 = bli_max( -diagoffb_j, 0 ); dim_t k_b1121 = k - off_b11; dim_t k_b11 = NR; dim_t k_b21 = k_b1121 - NR; dim_t off_b21 = off_b11 + k_b11; - /* Compute the addresses of the triangular block B11 and the - panel B21. */ + // Compute the addresses of the triangular block B11 and the + // panel B21. const char* b11 = b1; const char* b21 = b1 + k_b11 * PACKNR * dt_size; - /*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ + //b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 ); - /* Compute the panel stride for the current micro-panel. */ + // Compute the panel stride for the current micro-panel. inc_t ps_b_cur = k_b1121 * PACKNR; ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); ps_b_cur *= dt_size; - /* Loop over the m dimension (MR rows at a time). */ + // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { if ( bli_trsm_my_iter_rr( i, thread ) ){ dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); - /* Compute the addresses of the A11 block and A12 panel. */ + // Compute the addresses of the A11 block and A12 panel. const char* a11 = a1 + off_b11 * PACKMR * dt_size; const char* a12 = a1 + off_b21 * PACKMR * dt_size; - /* Compute the addresses of the next panels of A and B. */ + // Compute the addresses of the next panels of A and B. 
const char* a2 = a1; - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + //if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) { a2 = a_cast; @@ -279,9 +280,9 @@ void bli_trsm_rl_ker_var2 b2 = b_cast; } - /* Save addresses of next panels of A and B to the auxinfo_t - object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ + // Save addresses of next panels of A and B to the auxinfo_t + // object. NOTE: We swap the values for A and B since the + // triangular "A" matrix is actually contained within B. bli_auxinfo_set_next_a( b2, &aux ); bli_auxinfo_set_next_b( a2, &aux ); @@ -310,16 +311,16 @@ void bli_trsm_rl_ker_var2 } else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) { - /* Loop over the m dimension (MR rows at a time). */ + // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { if ( bli_trsm_my_iter_rr( i, thread ) ){ dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); - /* Compute the addresses of the next panels of A and B. */ + // Compute the addresses of the next panels of A and B. const char* a2 = a1; - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + //if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) { a2 = a_cast; @@ -328,13 +329,13 @@ void bli_trsm_rl_ker_var2 b2 = b_cast; } - /* Save addresses of next panels of A and B to the auxinfo_t - object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ + // Save addresses of next panels of A and B to the auxinfo_t + // object. NOTE: We swap the values for A and B since the + // triangular "A" matrix is actually contained within B. bli_auxinfo_set_next_a( b2, &aux ); bli_auxinfo_set_next_b( a2, &aux ); - /* Invoke the gemm micro-kernel. */ + // Invoke the gemm micro-kernel. 
gemm_ukr ( m_cur, diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 6cc9a8bbb..a05e94494 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_ru_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -244,7 +244,7 @@ void bli_trsm_ru_ker_var2 // block B11. const char* b01 = b1; const char* b11 = b1 + k_b01 * PACKNR * dt_size; - //b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ + //b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 ); // Compute the panel stride for the current micro-panel. inc_t ps_b_cur = k_b0111 * PACKNR; diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index a498e687e..4d7e72b43 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -48,7 +48,7 @@ void PASTEMAC0(opname) \ const obj_t* c, \ const cntx_t* cntx, \ const cntl_t* cntl, \ - thrinfo_t* thread \ + thrinfo_t* thread_par \ ); GENPROT( trsm_blk_var1 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index 39c5372f3..dfeefcd9d 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -43,12 +43,12 @@ static l3_var_oft vars[2][2] = void bli_trsm_xx_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { dim_t side; @@ -81,7 +81,7 @@ void bli_trsm_xx_ker_var2 c, cntx, cntl, - thread + thread_par ); } diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 1f00537d5..3fc76b978 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ 
-156,7 +156,7 @@ gint_t bli_info_get_enable_hpx_as_default( void ) return 0; #endif } -gint_t bli_info_get_thread_part_jrir_slab( void ) +gint_t bli_info_get_thread_jrir_slab( void ) { #ifdef BLIS_ENABLE_JRIR_SLAB return 1; @@ -164,7 +164,7 @@ gint_t bli_info_get_thread_part_jrir_slab( void ) return 0; #endif } -gint_t bli_info_get_thread_part_jrir_rr( void ) +gint_t bli_info_get_thread_jrir_rr( void ) { #ifdef BLIS_ENABLE_JRIR_RR return 1; @@ -172,6 +172,14 @@ gint_t bli_info_get_thread_part_jrir_rr( void ) return 0; #endif } +gint_t bli_info_get_thread_jrir_tlb( void ) +{ +#ifdef BLIS_ENABLE_JRIR_TLB + return 1; +#else + return 0; +#endif +} gint_t bli_info_get_enable_memkind( void ) { #ifdef BLIS_ENABLE_MEMKIND diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 08a99daea..300b3f584 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -74,8 +74,9 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp_as_default( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads_as_default( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx_as_default( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_slab( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_rr( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_tlb( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); diff --git a/frame/base/bli_prune.c b/frame/base/bli_prune.c index ebe5c2365..31c3d86d2 100644 --- a/frame/base/bli_prune.c +++ b/frame/base/bli_prune.c @@ -38,9 +38,28 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ) { - // If the primary object is general, it has no structure, and + // NOTE: This function is not safe to use on packed objects because it 
does + // not currently take into account the atomicity of the packed micropanel + // widths (i.e., the register blocksize). That is, this function will prune + // greedily, without regard to whether doing so would prune off part of a + // micropanel *which has already been packed* and "assigned" to a thread for + // inclusion in the computation. In order to be safe for use on packed + // matrices, this function would need to prune only up to the nearest + // micropanel edge (and to the corresponding location within the secondary + // matrix), which may not coincide exactly with the diagonal offset. + if ( bli_obj_is_packed( p ) || bli_obj_is_packed( s ) ) bli_abort(); + + // If the primary object is general AND dense, it has no structure, and // therefore, no unreferenced parts. - if ( bli_obj_is_general( p ) ) return; + // NOTE: There is at least one situation where the matrix is general but + // its uplo_t value is lower or upper: gemmt. This operation benefits from + // pruning unreferenced regions the same way herk/her2k/syrk/syr2k would. + // Because of gemmt, and any future similar operations, we limit early + // returns to situations where the primary object has a dense uplo_t value + // IN ADDITION TO general structure (rather than only checking for general + // structure). + if ( bli_obj_is_general( p ) && + bli_obj_is_dense( p ) ) return; // If the primary object is BLIS_ZEROS, set the dimensions so that the // matrix is empty. This is not strictly needed but rather a minor @@ -116,21 +135,13 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, if ( bli_is_m_dim( mdim_p ) ) q = m; else /* if ( bli_is_n_dim( mdim_p ) ) */ q = n; - // Update the affected objects in case anything changed. Notice that - // it is okay to update the dimension and diagonal offset fields of - // packed primary objects, as long as we do so in tandem with the - // secondary object to maintain conformality.
This just means that - // the "ignore-able" zero region is skipped over here, rather than - // within the macro-kernel. + // Update the affected objects' diagonal offset, dimensions, and row + // and column offsets, in case anything changed. bli_obj_set_diag_offset( diagoff_p, p ); bli_obj_set_dim( mdim_p, q, p ); bli_obj_set_dim( mdim_s, q, s ); - - // Only update the affected offset fields if the object in question - // is NOT a packed object. Otherwise, bli_obj_buffer_at_off() will - // compute the wrong address within the macro-kernel object wrapper. - if ( !bli_obj_is_packed( p ) ) { bli_obj_inc_off( mdim_p, off_inc, p ); } - if ( !bli_obj_is_packed( s ) ) { bli_obj_inc_off( mdim_s, off_inc, s ); } + bli_obj_inc_off( mdim_p, off_inc, p ); + bli_obj_inc_off( mdim_s, off_inc, s ); } } diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 786998f23..64124c682 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -143,12 +143,44 @@ void bli_rntm_set_ways_for_op // kind of information is already stored in the rntm_t object. bli_rntm_factorize( m, n, k, rntm ); -#if 0 -printf( "bli_rntm_set_ways_for_op()\n" ); -bli_rntm_print( rntm ); -#endif + #if 0 + printf( "bli_rntm_set_ways_for_op()\n" ); + bli_rntm_print( rntm ); + #endif // Now modify the number of ways, if necessary, based on the operation. + + // Consider gemm (hemm, symm), gemmt (herk, her2k, syrk, syr2k), and + // trmm (trmm, trmm3). + if ( +#ifdef BLIS_ENABLE_JRIR_TLB + l3_op == BLIS_GEMM || + l3_op == BLIS_GEMMT || + l3_op == BLIS_TRMM || +#endif + FALSE + ) + { + dim_t jc = bli_rntm_jc_ways( rntm ); + dim_t pc = bli_rntm_pc_ways( rntm ); + dim_t ic = bli_rntm_ic_ways( rntm ); + dim_t jr = bli_rntm_jr_ways( rntm ); + dim_t ir = bli_rntm_ir_ways( rntm ); + + // If TLB is enabled for gemm or gemmt, redirect any ir loop parallelism + // into the jr loop. + bli_rntm_set_ways_only + ( + jc, + pc, + ic, + jr * ir, + 1, + rntm + ); + } + + // Consider trmm, trmm3, trsm. 
if ( l3_op == BLIS_TRMM || l3_op == BLIS_TRSM ) { diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index bf9319f4f..9e9d47699 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -36,6 +36,16 @@ #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H +// NOTE: This file should ONLY contain processing of macros that are set by +// configure and output into bli_config.h. Any other macro processing -- +// especially such as for those macros that are expected to be optionally +// set within a configuration's bli_family_.h header -- MUST be placed +// in bli_kernel_macro_defs.h instead. The reason: bli_arch_config.h (which +// #includes the configuration's bli_family_.h header) is #included +// much later in blis.h than this file (bli_config_macro_defs.h), and so any +// macros set in bli_family_.h would have no effect on the processing +// that happens below. + // -- INTEGER PROPERTIES ------------------------------------------------------- diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index d273c353a..8c0f1cb14 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -151,6 +151,7 @@ #define BLIS_FREE_USER free #endif + // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. 
This is used to align blocks within the @@ -245,6 +246,7 @@ #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif + // -- MR and NR blocksizes (only for reference kernels) ------------------------ // The build system defines BLIS_IN_REF_KERNEL, but only when compiling diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 1822065da..0865b11e9 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -927,7 +927,6 @@ BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) } - // index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) @@ -954,7 +953,7 @@ BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) ( i != 0 || n_left == 0 ); } -BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter ) { return ( bool ) ( i == end_iter - 1 ); @@ -966,15 +965,59 @@ BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } -BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +BLIS_INLINE bool bli_is_last_iter_slrr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB - return bli_is_last_iter_sl( i, end_iter, tid, nth ); + return bli_is_last_iter_sl( i, end_iter ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } +BLIS_INLINE bool bli_is_last_iter_l( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +{ + return bli_is_last_iter_slrr( i, end_iter, tid, nth ); +} + +BLIS_INLINE bool bli_is_last_iter_u( doff_t diagoff, dim_t mr, dim_t nr, inc_t inc ) +{ + return bli_is_strictly_below_diag_n( diagoff + inc*mr, mr, nr ); +} + +BLIS_INLINE bool bli_is_last_iter_tlb_l( dim_t i, dim_t end_iter ) +{ + return bli_is_last_iter_sl( i, end_iter ); +} + +BLIS_INLINE bool bli_is_last_iter_tlb_u( doff_t diagoff, 
dim_t mr, dim_t nr ) +{ + return bli_is_strictly_below_diag_n( diagoff + 1*mr, mr, nr ); +} + +BLIS_INLINE bool bli_is_my_iter_sl( dim_t i, dim_t st, dim_t en ) +{ + return ( st <= i && i < en ); +} + +BLIS_INLINE bool bli_is_my_iter_rr( dim_t i, dim_t work_id, dim_t n_way ) +{ + return ( i % n_way == work_id % n_way ); +} + +BLIS_INLINE bool bli_is_my_iter( dim_t i, dim_t st, dim_t en, dim_t work_id, dim_t n_way ) +{ + // NOTE: This function is (as of this writing) only called from packm. + // If the structure of the cpp macros below is ever changed, make sure + // it is still consistent with that of bli_thread_range_slrr() since + // these functions are used together in packm. + +#ifdef BLIS_ENABLE_JRIR_RR + return bli_is_my_iter_rr( i, work_id, n_way ); +#else // ifdef ( _SLAB || _TLB ) + return bli_is_my_iter_sl( i, st, en ); +#endif +} + // packbuf_t-related diff --git a/frame/include/blis.h b/frame/include/blis.h index 98ebee878..70005e57d 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -83,6 +83,10 @@ extern "C" { // -- Threading definitions -- #include "bli_thread.h" +#include "bli_thread_range.h" +#include "bli_thread_range_slab_rr.h" +#include "bli_thread_range_tlb.h" + #include "bli_pthread.h" diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 4cba76b20..d41f37053 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -108,907 +108,6 @@ void bli_thread_launch // ----------------------------------------------------------------------------- -void bli_thread_range_sub - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end - ) -{ - dim_t n_way = bli_thrinfo_n_way( thread ); - - if ( n_way == 1 ) { *start = 0; *end = n; return; } - - dim_t work_id = bli_thrinfo_work_id( thread ); - - dim_t all_start = 0; - dim_t all_end = n; - - dim_t size = all_end - all_start; - - dim_t n_bf_whole = size / bf; - dim_t n_bf_left = size % bf; - - dim_t n_bf_lo = 
n_bf_whole / n_way; - dim_t n_bf_hi = n_bf_whole / n_way; - - // In this function, we partition the space between all_start and - // all_end into n_way partitions, each a multiple of block_factor - // with the exception of the one partition that recieves the - // "edge" case (if applicable). - // - // Here are examples of various thread partitionings, in units of - // the block_factor, when n_way = 4. (A '+' indicates the thread - // that receives the leftover edge case (ie: n_bf_left extra - // rows/columns in its sub-range). - // (all_start ... all_end) - // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3 - // 12 =0 f 0 4 3 3 3 3 - // 12 >0 f 0 4 3 3 3 3+ - // 13 >0 f 1 3 4 3 3 3+ - // 14 >0 f 2 2 4 4 3 3+ - // 15 >0 f 3 1 4 4 4 3+ - // 15 =0 f 3 1 4 4 4 3 - // - // 12 =0 t 4 0 3 3 3 3 - // 12 >0 t 4 0 3+ 3 3 3 - // 13 >0 t 3 1 3+ 3 3 4 - // 14 >0 t 2 2 3+ 3 4 4 - // 15 >0 t 1 3 3+ 4 4 4 - // 15 =0 t 1 3 3 4 4 4 - - // As indicated by the table above, load is balanced as equally - // as possible, even in the presence of an edge case. - - // First, we must differentiate between cases where the leftover - // "edge" case (n_bf_left) should be allocated to a thread partition - // at the low end of the index range or the high end. - - if ( handle_edge_low == FALSE ) - { - // Notice that if all threads receive the same number of - // block_factors, those threads are considered "high" and - // the "low" thread group is empty. - dim_t n_th_lo = n_bf_whole % n_way; - //dim_t n_th_hi = n_way - n_th_lo; - - // If some partitions must have more block_factors than others - // assign the slightly larger partitions to lower index threads. - if ( n_th_lo != 0 ) n_bf_lo += 1; - - // Compute the actual widths (in units of rows/columns) of - // individual threads in the low and high groups. - dim_t size_lo = n_bf_lo * bf; - dim_t size_hi = n_bf_hi * bf; - - // Precompute the starting indices of the low and high groups. 
- dim_t lo_start = all_start; - dim_t hi_start = all_start + n_th_lo * size_lo; - - // Compute the start and end of individual threads' ranges - // as a function of their work_ids and also the group to which - // they belong (low or high). - if ( work_id < n_th_lo ) - { - *start = lo_start + (work_id ) * size_lo; - *end = lo_start + (work_id+1) * size_lo; - } - else // if ( n_th_lo <= work_id ) - { - *start = hi_start + (work_id-n_th_lo ) * size_hi; - *end = hi_start + (work_id-n_th_lo+1) * size_hi; - - // Since the edge case is being allocated to the high - // end of the index range, we have to advance the last - // thread's end. - if ( work_id == n_way - 1 ) *end += n_bf_left; - } - } - else // if ( handle_edge_low == TRUE ) - { - // Notice that if all threads receive the same number of - // block_factors, those threads are considered "low" and - // the "high" thread group is empty. - dim_t n_th_hi = n_bf_whole % n_way; - dim_t n_th_lo = n_way - n_th_hi; - - // If some partitions must have more block_factors than others - // assign the slightly larger partitions to higher index threads. - if ( n_th_hi != 0 ) n_bf_hi += 1; - - // Compute the actual widths (in units of rows/columns) of - // individual threads in the low and high groups. - dim_t size_lo = n_bf_lo * bf; - dim_t size_hi = n_bf_hi * bf; - - // Precompute the starting indices of the low and high groups. - dim_t lo_start = all_start; - dim_t hi_start = all_start + n_th_lo * size_lo - + n_bf_left; - - // Compute the start and end of individual threads' ranges - // as a function of their work_ids and also the group to which - // they belong (low or high). - if ( work_id < n_th_lo ) - { - *start = lo_start + (work_id ) * size_lo; - *end = lo_start + (work_id+1) * size_lo; - - // Since the edge case is being allocated to the low - // end of the index range, we have to advance the - // starts/ends accordingly. 
- if ( work_id == 0 ) *end += n_bf_left; - else { *start += n_bf_left; - *end += n_bf_left; } - } - else // if ( n_th_lo <= work_id ) - { - *start = hi_start + (work_id-n_th_lo ) * size_hi; - *end = hi_start + (work_id-n_th_lo+1) * size_hi; - } - } -} - -siz_t bli_thread_range_l2r - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, n, bf, - FALSE, start, end ); - - return m * ( *end - *start ); -} - -siz_t bli_thread_range_r2l - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, n, bf, - TRUE, start, end ); - - return m * ( *end - *start ); -} - -siz_t bli_thread_range_t2b - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, m, bf, - FALSE, start, end ); - - return n * ( *end - *start ); -} - -siz_t bli_thread_range_b2t - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, m, bf, - TRUE, start, end ); - - return n * ( *end - *start ); -} - -// ----------------------------------------------------------------------------- - -dim_t bli_thread_range_width_l - ( - doff_t diagoff_j, - dim_t m, - 
dim_t n_j, - dim_t j, - dim_t n_way, - dim_t bf, - dim_t bf_left, - double area_per_thr, - bool handle_edge_low - ) -{ - dim_t width; - - // In this function, we assume that we are somewhere in the process of - // partitioning an m x n lower-stored region (with arbitrary diagonal - // offset) n_ways along the n dimension (into column panels). The value - // j identifies the left-to-right subpartition index (from 0 to n_way-1) - // of the subpartition whose width we are about to compute using the - // area per thread determined by the caller. n_j is the number of - // columns in the remaining region of the matrix being partitioned, - // and diagoff_j is that region's diagonal offset. - - // If this is the last subpartition, the width is simply equal to n_j. - // Note that this statement handles cases where the "edge case" (if - // one exists) is assigned to the high end of the index range (ie: - // handle_edge_low == FALSE). - if ( j == n_way - 1 ) return n_j; - - // At this point, we know there are at least two subpartitions left. - // We also know that IF the submatrix contains a completely dense - // rectangular submatrix, it will occur BEFORE the triangular (or - // trapezoidal) part. - - // Here, we implement a somewhat minor load balancing optimization - // that ends up getting employed only for relatively small matrices. - // First, recall that all subpartition widths will be some multiple - // of the blocking factor bf, except perhaps either the first or last - // subpartition, which will receive the edge case, if it exists. - // Also recall that j represents the current thread (or thread group, - // or "caucus") for which we are computing a subpartition width. - // If n_j is sufficiently small that we can only allocate bf columns - // to each of the remaining threads, then we set the width to bf. 
We - // do not allow the subpartition width to be less than bf, so, under - // some conditions, if n_j is small enough, some of the reamining - // threads may not get any work. For the purposes of this lower bound - // on work (ie: width >= bf), we allow the edge case to count as a - // "full" set of bf columns. - { - dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 ); - - if ( n_j_bf <= n_way - j ) - { - if ( j == 0 && handle_edge_low ) - width = ( bf_left > 0 ? bf_left : bf ); - else - width = bf; - - // Make sure that the width does not exceed n_j. This would - // occur if and when n_j_bf < n_way - j; that is, when the - // matrix being partitioned is sufficiently small relative to - // n_way such that there is not even enough work for every - // (remaining) thread to get bf (or bf_left) columns. The - // net effect of this safeguard is that some threads may get - // assigned empty ranges (ie: no work), which of course must - // happen in some situations. - if ( width > n_j ) width = n_j; - - return width; - } - } - - // This block computes the width assuming that we are entirely within - // a dense rectangle that precedes the triangular (or trapezoidal) - // part. - { - // First compute the width of the current panel under the - // assumption that the diagonal offset would not intersect. - width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m ); - - // Adjust the width, if necessary. Specifically, we may need - // to allocate the edge case to the first subpartition, if - // requested; otherwise, we just need to ensure that the - // subpartition is a multiple of the blocking factor. - if ( j == 0 && handle_edge_low ) - { - if ( width % bf != bf_left ) width += bf_left - ( width % bf ); - } - else // if interior case - { - // Round up to the next multiple of the blocking factor. - //if ( width % bf != 0 ) width += bf - ( width % bf ); - // Round to the nearest multiple of the blocking factor. 
- if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); - } - } - - // We need to recompute width if the panel, according to the width - // as currently computed, would intersect the diagonal. - if ( diagoff_j < width ) - { - dim_t offm_inc, offn_inc; - - // Prune away the unstored region above the diagonal, if it exists. - // Note that the entire region was pruned initially, so we know that - // we don't need to try to prune the right side. (Also, we discard - // the offset deltas since we don't need to actually index into the - // subpartition.) - bli_prune_unstored_region_top_l( &diagoff_j, &m, &n_j, &offm_inc ); - //bli_prune_unstored_region_right_l( &diagoff_j, &m, &n_j, &offn_inc ); - - // We don't need offm_inc, offn_inc here. These statements should - // prevent compiler warnings. - ( void )offm_inc; - ( void )offn_inc; - - // Prepare to solve a quadratic equation to find the width of the - // current (jth) subpartition given the m dimension, diagonal offset, - // and area. - // NOTE: We know that the +/- in the quadratic formula must be a + - // here because we know that the desired solution (the subpartition - // width) will be smaller than (m + diagoff), not larger. If you - // don't believe me, draw a picture! - const double a = -0.5; - const double b = ( double )m + ( double )diagoff_j + 0.5; - const double c = -0.5 * ( ( double )diagoff_j * - ( ( double )diagoff_j + 1.0 ) - ) - area_per_thr; - const double r = b * b - 4.0 * a * c; - - // If the quadratic solution is not imaginary, round it and use that - // as our width, but make sure it didn't round to zero. Otherwise, - // discard the quadratic solution and leave width, as previously - // computed, unchanged. - if ( r >= 0.0 ) - { - const double x = ( -b + sqrt( r ) ) / ( 2.0 * a ); - - width = ( dim_t )bli_round( x ); - if ( width == 0 ) width = 1; - } - - // Adjust the width, if necessary. 
- if ( j == 0 && handle_edge_low ) - { - if ( width % bf != bf_left ) width += bf_left - ( width % bf ); - } - else // if interior case - { - // Round up to the next multiple of the blocking factor. - //if ( width % bf != 0 ) width += bf - ( width % bf ); - // Round to the nearest multiple of the blocking factor. - if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); - } - } - - // Make sure that the width, after being adjusted, does not cause the - // subpartition to exceed n_j. - if ( width > n_j ) width = n_j; - - return width; -} - -siz_t bli_find_area_trap_l - ( - dim_t m, - dim_t n, - doff_t diagoff - ) -{ - dim_t offm_inc = 0; - dim_t offn_inc = 0; - double tri_area; - double area; - - // Prune away any rectangular region above where the diagonal - // intersects the left edge of the subpartition, if it exists. - bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); - - // Prune away any rectangular region to the right of where the - // diagonal intersects the bottom edge of the subpartition, if - // it exists. (This shouldn't ever be needed, since the caller - // would presumably have already performed rightward pruning, - // but it's here just in case.) - bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); - - ( void )offm_inc; - ( void )offn_inc; - - // Compute the area of the empty triangle so we can subtract it - // from the area of the rectangle that bounds the subpartition. - if ( bli_intersects_diag_n( diagoff, m, n ) ) - { - double tri_dim = ( double )( n - diagoff - 1 ); - tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; - } - else - { - // If the diagonal does not intersect the trapezoid, then - // we can compute the area as a simple rectangle. 
- tri_area = 0.0; - } - - area = ( double )m * ( double )n - tri_area; - - return ( siz_t )area; -} - -// ----------------------------------------------------------------------------- - -siz_t bli_thread_range_weighted_sub - ( - const thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr - ) -{ - dim_t n_way = bli_thrinfo_n_way( thread ); - dim_t my_id = bli_thrinfo_work_id( thread ); - - dim_t bf_left = n % bf; - - dim_t j; - - dim_t off_j; - doff_t diagoff_j; - dim_t n_left; - - dim_t width_j; - - dim_t offm_inc, offn_inc; - - double tri_dim, tri_area; - double area_total, area_per_thr; - - siz_t area = 0; - - // In this function, we assume that the caller has already determined - // that (a) the diagonal intersects the submatrix, and (b) the submatrix - // is either lower- or upper-stored. - - if ( bli_is_lower( uplo ) ) - { - // Prune away the unstored region above the diagonal, if it exists, - // and then to the right of where the diagonal intersects the bottom, - // if it exists. (Also, we discard the offset deltas since we don't - // need to actually index into the subpartition.) - bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); - bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); - - // We don't need offm_inc, offn_inc here. These statements should - // prevent compiler warnings. - ( void )offm_inc; - ( void )offn_inc; - - // Now that pruning has taken place, we know that diagoff >= 0. - - // Compute the total area of the submatrix, accounting for the - // location of the diagonal, and divide it by the number of ways - // of parallelism. 
- tri_dim = ( double )( n - diagoff - 1 ); - tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; - area_total = ( double )m * ( double )n - tri_area; - area_per_thr = area_total / ( double )n_way; - - // Initialize some variables prior to the loop: the offset to the - // current subpartition, the remainder of the n dimension, and - // the diagonal offset of the current subpartition. - off_j = 0; - diagoff_j = diagoff; - n_left = n; - - // Iterate over the subpartition indices corresponding to each - // thread/caucus participating in the n_way parallelism. - for ( j = 0; j < n_way; ++j ) - { - // Compute the width of the jth subpartition, taking the - // current diagonal offset into account, if needed. - width_j = - bli_thread_range_width_l - ( - diagoff_j, m, n_left, - j, n_way, - bf, bf_left, - area_per_thr, - handle_edge_low - ); - - // If the current thread belongs to caucus j, this is his - // subpartition. So we compute the implied index range and - // end our search. - if ( j == my_id ) - { - *j_start_thr = off_j; - *j_end_thr = off_j + width_j; - - area = bli_find_area_trap_l( m, width_j, diagoff_j ); - - break; - } - - // Shift the current subpartition's starting and diagonal offsets, - // as well as the remainder of the n dimension, according to the - // computed width, and then iterate to the next subpartition. - off_j += width_j; - diagoff_j -= width_j; - n_left -= width_j; - } - } - else // if ( bli_is_upper( uplo ) ) - { - // Express the upper-stored case in terms of the lower-stored case. - - // First, we convert the upper-stored trapezoid to an equivalent - // lower-stored trapezoid by rotating it 180 degrees. - bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - - // Now that the trapezoid is "flipped" in the n dimension, negate - // the bool that encodes whether to handle the edge case at the - // low (or high) end of the index range. - bli_toggle_bool( &handle_edge_low ); - - // Compute the appropriate range for the rotated trapezoid. 
- area = bli_thread_range_weighted_sub - ( - thread, diagoff, uplo, m, n, bf, - handle_edge_low, - j_start_thr, j_end_thr - ); - - // Reverse the indexing basis for the subpartition ranges so that - // the indices, relative to left-to-right iteration through the - // unrotated upper-stored trapezoid, map to the correct columns - // (relative to the diagonal). This amounts to subtracting the - // range from n. - bli_reverse_index_direction( n, j_start_thr, j_end_thr ); - } - - return area; -} - -siz_t bli_thread_range_mdim - ( - dir_t direct, - const thrinfo_t* thr, - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntl_t* cntl, - const cntx_t* cntx, - dim_t* start, - dim_t* end - ) -{ - bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntl_family( cntl ); - - // This is part of trsm's current implementation, whereby right side - // cases are implemented in left-side micro-kernels, which requires - // we swap the usage of the register blocksizes for the purposes of - // packing A and B. - if ( family == BLIS_TRSM ) - { - if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR; - else bszid = BLIS_NR; - } - - const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - const obj_t* x; - bool use_weighted; - - // Use the operation family to choose the one of the two matrices - // being partitioned that potentially has structure, and also to - // decide whether or not we need to use weighted range partitioning. - // NOTE: It's important that we use non-weighted range partitioning - // for hemm and symm (ie: the gemm family) because the weighted - // function will mistakenly skip over unstored regions of the - // structured matrix, even though they represent part of that matrix - // that will be dense and full (after packing). 
- if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } - else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } - else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } - else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } - - if ( use_weighted ) - { - if ( direct == BLIS_FWD ) - return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); - else - return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); - } - else - { - if ( direct == BLIS_FWD ) - return bli_thread_range_t2b( thr, x, bmult, start, end ); - else - return bli_thread_range_b2t( thr, x, bmult, start, end ); - } -} - -siz_t bli_thread_range_ndim - ( - dir_t direct, - const thrinfo_t* thr, - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntl_t* cntl, - const cntx_t* cntx, - dim_t* start, - dim_t* end - ) -{ - bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntl_family( cntl ); - - // This is part of trsm's current implementation, whereby right side - // cases are implemented in left-side micro-kernels, which requires - // we swap the usage of the register blocksizes for the purposes of - // packing A and B. - if ( family == BLIS_TRSM ) - { - if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR; - else bszid = BLIS_NR; - } - - const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - const obj_t* x; - bool use_weighted; - - // Use the operation family to choose the one of the two matrices - // being partitioned that potentially has structure, and also to - // decide whether or not we need to use weighted range partitioning. - // NOTE: It's important that we use non-weighted range partitioning - // for hemm and symm (ie: the gemm family) because the weighted - // function will mistakenly skip over unstored regions of the - // structured matrix, even though they represent part of that matrix - // that will be dense and full (after packing). 
- if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } - else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } - else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } - else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } - - if ( use_weighted ) - { - if ( direct == BLIS_FWD ) - return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); - else - return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); - } - else - { - if ( direct == BLIS_FWD ) - return bli_thread_range_l2r( thr, x, bmult, start, end ); - else - return bli_thread_range_r2l( thr, x, bmult, start, end ); - } -} - -siz_t bli_thread_range_weighted_l2r - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the n dimension - // where the total range spans 0 to n-1 with 0 at the left end and - // n-1 at the right end. - - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. - if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - area = - bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - FALSE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_l2r - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -siz_t bli_thread_range_weighted_r2l - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the n dimension - // where the total range spans 0 to n-1 with 0 at the right end and - // n-1 at the left end. 
- - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. - if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - - area = - bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - TRUE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_r2l - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -siz_t bli_thread_range_weighted_t2b - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the m dimension - // where the total range spans 0 to m-1 with 0 at the top end and - // m-1 at the bottom end. - - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. 
- if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - - area = - bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - FALSE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_t2b - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -siz_t bli_thread_range_weighted_b2t - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the m dimension - // where the total range spans 0 to m-1 with 0 at the bottom end and - // m-1 at the top end. - - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. 
- if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - - bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - - area = bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - TRUE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_b2t - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -// ----------------------------------------------------------------------------- - void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors ) { factors->n = n; diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index e61fc8b89..5002672dc 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -56,6 +56,8 @@ typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params void bli_thread_init( void ); void bli_thread_finalize( void ); +// ----------------------------------------------------------------------------- + BLIS_EXPORT_BLIS void bli_thread_launch ( timpl_t ti, @@ -64,91 +66,6 @@ BLIS_EXPORT_BLIS void bli_thread_launch const void* params ); -// Thread range-related prototypes. 
- -BLIS_EXPORT_BLIS void bli_thread_range_sub - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end - ); - -#undef GENPROT -#define GENPROT( opname ) \ -\ -siz_t PASTEMAC0( opname ) \ - ( \ - dir_t direct, \ - const thrinfo_t* thr, \ - const obj_t* a, \ - const obj_t* b, \ - const obj_t* c, \ - const cntl_t* cntl, \ - const cntx_t* cntx, \ - dim_t* start, \ - dim_t* end \ - ); - -GENPROT( thread_range_mdim ) -GENPROT( thread_range_ndim ) - -#undef GENPROT -#define GENPROT( opname ) \ -\ -siz_t PASTEMAC0( opname ) \ - ( \ - const thrinfo_t* thr, \ - const obj_t* a, \ - const blksz_t* bmult, \ - dim_t* start, \ - dim_t* end \ - ); - -GENPROT( thread_range_l2r ) -GENPROT( thread_range_r2l ) -GENPROT( thread_range_t2b ) -GENPROT( thread_range_b2t ) - -GENPROT( thread_range_weighted_l2r ) -GENPROT( thread_range_weighted_r2l ) -GENPROT( thread_range_weighted_t2b ) -GENPROT( thread_range_weighted_b2t ) - - -dim_t bli_thread_range_width_l - ( - doff_t diagoff_j, - dim_t m, - dim_t n_j, - dim_t j, - dim_t n_way, - dim_t bf, - dim_t bf_left, - double area_per_thr, - bool handle_edge_low - ); -siz_t bli_find_area_trap_l - ( - dim_t m, - dim_t n, - doff_t diagoff - ); -siz_t bli_thread_range_weighted_sub - ( - const thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr - ); - // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes @@ -212,98 +129,5 @@ BLIS_EXPORT_BLIS void bli_thread_set_thread_impl( timpl_t ti ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); -// ----------------------------------------------------------------------------- - -BLIS_INLINE void bli_thread_range_jrir_rr - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Use interleaved partitioning of jr/ir 
loops. - *start = bli_thrinfo_work_id( thread ); - *inc = bli_thrinfo_n_way( thread ); - *end = n; -} - -BLIS_INLINE void bli_thread_range_jrir_sl - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Use contiguous slab partitioning of jr/ir loops. - bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); - *inc = 1; -} - -BLIS_INLINE void bli_thread_range_jrir - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Define a general-purpose version of bli_thread_range_jrir() whose - // definition depends on whether slab or round-robin partitioning was - // requested at configure-time. -#ifdef BLIS_ENABLE_JRIR_SLAB - bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); -#else - bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); -#endif -} - -#if 0 -BLIS_INLINE void bli_thread_range_weighted_jrir - ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ -#ifdef BLIS_ENABLE_JRIR_SLAB - - // Use contiguous slab partitioning for jr/ir loops. - bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, - handle_edge_low, start, end ); - - *start = *start / bf; *inc = 1; - - if ( *end % bf ) *end = *end / bf + 1; - else *end = *end / bf; - -#else - // Use interleaved partitioning of jr/ir loops. - *start = bli_thrinfo_work_id( thread ); - *inc = bli_thrinfo_n_way( thread ); - *end = n; - -#endif -} #endif - -#endif - diff --git a/frame/thread/bli_thread_range.c b/frame/thread/bli_thread_range.c new file mode 100644 index 000000000..a28e529b0 --- /dev/null +++ b/frame/thread/bli_thread_range.c @@ -0,0 +1,1121 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_thread_range_sub + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end + ) +{ + dim_t n_way = bli_thrinfo_n_way( thread ); + + if ( n_way == 1 ) { *start = 0; *end = n; return; } + + dim_t work_id = bli_thrinfo_work_id( thread ); + + dim_t all_start = 0; + dim_t all_end = n; + + dim_t size = all_end - all_start; + + dim_t n_bf_whole = size / bf; + dim_t n_bf_left = size % bf; + + dim_t n_bf_lo = n_bf_whole / n_way; + dim_t n_bf_hi = n_bf_whole / n_way; + + // In this function, we partition the space between all_start and + // all_end into n_way partitions, each a multiple of block_factor + // with the exception of the one partition that recieves the + // "edge" case (if applicable). + // + // Here are examples of various thread partitionings, in units of + // the block_factor, when n_way = 4. (A '+' indicates the thread + // that receives the leftover edge case (ie: n_bf_left extra + // rows/columns in its sub-range). + // (all_start ... all_end) + // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3 + // 12 =0 f 0 4 3 3 3 3 + // 12 >0 f 0 4 3 3 3 3+ + // 13 >0 f 1 3 4 3 3 3+ + // 14 >0 f 2 2 4 4 3 3+ + // 15 >0 f 3 1 4 4 4 3+ + // 15 =0 f 3 1 4 4 4 3 + // + // 12 =0 t 4 0 3 3 3 3 + // 12 >0 t 4 0 3+ 3 3 3 + // 13 >0 t 3 1 3+ 3 3 4 + // 14 >0 t 2 2 3+ 3 4 4 + // 15 >0 t 1 3 3+ 4 4 4 + // 15 =0 t 1 3 3 4 4 4 + + // As indicated by the table above, load is balanced as equally + // as possible, even in the presence of an edge case. + + // First, we must differentiate between cases where the leftover + // "edge" case (n_bf_left) should be allocated to a thread partition + // at the low end of the index range or the high end. + + if ( handle_edge_low == FALSE ) + { + // Notice that if all threads receive the same number of + // block_factors, those threads are considered "high" and + // the "low" thread group is empty. 
+ dim_t n_th_lo = n_bf_whole % n_way; + //dim_t n_th_hi = n_way - n_th_lo; + + // If some partitions must have more block_factors than others + // assign the slightly larger partitions to lower index threads. + if ( n_th_lo != 0 ) n_bf_lo += 1; + + // Compute the actual widths (in units of rows/columns) of + // individual threads in the low and high groups. + dim_t size_lo = n_bf_lo * bf; + dim_t size_hi = n_bf_hi * bf; + + // Precompute the starting indices of the low and high groups. + dim_t lo_start = all_start; + dim_t hi_start = all_start + n_th_lo * size_lo; + + // Compute the start and end of individual threads' ranges + // as a function of their work_ids and also the group to which + // they belong (low or high). + if ( work_id < n_th_lo ) + { + *start = lo_start + (work_id ) * size_lo; + *end = lo_start + (work_id+1) * size_lo; + } + else // if ( n_th_lo <= work_id ) + { + *start = hi_start + (work_id-n_th_lo ) * size_hi; + *end = hi_start + (work_id-n_th_lo+1) * size_hi; + + // Since the edge case is being allocated to the high + // end of the index range, we have to advance the last + // thread's end. + if ( work_id == n_way - 1 ) *end += n_bf_left; + } + } + else // if ( handle_edge_low == TRUE ) + { + // Notice that if all threads receive the same number of + // block_factors, those threads are considered "low" and + // the "high" thread group is empty. + dim_t n_th_hi = n_bf_whole % n_way; + dim_t n_th_lo = n_way - n_th_hi; + + // If some partitions must have more block_factors than others + // assign the slightly larger partitions to higher index threads. + if ( n_th_hi != 0 ) n_bf_hi += 1; + + // Compute the actual widths (in units of rows/columns) of + // individual threads in the low and high groups. + dim_t size_lo = n_bf_lo * bf; + dim_t size_hi = n_bf_hi * bf; + + // Precompute the starting indices of the low and high groups. 
+ dim_t lo_start = all_start; + dim_t hi_start = all_start + n_th_lo * size_lo + + n_bf_left; + + // Compute the start and end of individual threads' ranges + // as a function of their work_ids and also the group to which + // they belong (low or high). + if ( work_id < n_th_lo ) + { + *start = lo_start + (work_id ) * size_lo; + *end = lo_start + (work_id+1) * size_lo; + + // Since the edge case is being allocated to the low + // end of the index range, we have to advance the + // starts/ends accordingly. + if ( work_id == 0 ) *end += n_bf_left; + else { *start += n_bf_left; + *end += n_bf_left; } + } + else // if ( n_th_lo <= work_id ) + { + *start = hi_start + (work_id-n_th_lo ) * size_hi; + *end = hi_start + (work_id-n_th_lo+1) * size_hi; + } + } +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_l2r + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, n, bf, + FALSE, start, end ); + + return m * ( *end - *start ); +} + +siz_t bli_thread_range_r2l + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, n, bf, + TRUE, start, end ); + + return m * ( *end - *start ); +} + +siz_t bli_thread_range_t2b + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, m, bf, + FALSE, start, end 
); + + return n * ( *end - *start ); +} + +siz_t bli_thread_range_b2t + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, m, bf, + TRUE, start, end ); + + return n * ( *end - *start ); +} + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_width_l + ( + doff_t diagoff_j, + dim_t m, + dim_t n_j, + dim_t j, + dim_t n_way, + dim_t bf, + dim_t bf_left, + double area_per_thr, + bool handle_edge_low + ) +{ + dim_t width; + + // In this function, we assume that we are somewhere in the process of + // partitioning an m x n lower-stored region (with arbitrary diagonal + // offset) n_ways along the n dimension (into column panels). The value + // j identifies the left-to-right subpartition index (from 0 to n_way-1) + // of the subpartition whose width we are about to compute using the + // area per thread determined by the caller. n_j is the number of + // columns in the remaining region of the matrix being partitioned, + // and diagoff_j is that region's diagonal offset. + + // If this is the last subpartition, the width is simply equal to n_j. + // Note that this statement handles cases where the "edge case" (if + // one exists) is assigned to the high end of the index range (ie: + // handle_edge_low == FALSE). + if ( j == n_way - 1 ) return n_j; + + // At this point, we know there are at least two subpartitions left. + // We also know that IF the submatrix contains a completely dense + // rectangular submatrix, it will occur BEFORE the triangular (or + // trapezoidal) part. + + // Here, we implement a somewhat minor load balancing optimization + // that ends up getting employed only for relatively small matrices. 
+ // First, recall that all subpartition widths will be some multiple + // of the blocking factor bf, except perhaps either the first or last + // subpartition, which will receive the edge case, if it exists. + // Also recall that j represents the current thread (or thread group, + // or "caucus") for which we are computing a subpartition width. + // If n_j is sufficiently small that we can only allocate bf columns + // to each of the remaining threads, then we set the width to bf. We + // do not allow the subpartition width to be less than bf, so, under + // some conditions, if n_j is small enough, some of the reamining + // threads may not get any work. For the purposes of this lower bound + // on work (ie: width >= bf), we allow the edge case to count as a + // "full" set of bf columns. + { + dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 ); + + if ( n_j_bf <= n_way - j ) + { + if ( j == 0 && handle_edge_low ) + width = ( bf_left > 0 ? bf_left : bf ); + else + width = bf; + + // Make sure that the width does not exceed n_j. This would + // occur if and when n_j_bf < n_way - j; that is, when the + // matrix being partitioned is sufficiently small relative to + // n_way such that there is not even enough work for every + // (remaining) thread to get bf (or bf_left) columns. The + // net effect of this safeguard is that some threads may get + // assigned empty ranges (ie: no work), which of course must + // happen in some situations. + if ( width > n_j ) width = n_j; + + return width; + } + } + + // This block computes the width assuming that we are entirely within + // a dense rectangle that precedes the triangular (or trapezoidal) + // part. + { + // First compute the width of the current panel under the + // assumption that the diagonal offset would not intersect. + width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m ); + + // Adjust the width, if necessary. 
Specifically, we may need + // to allocate the edge case to the first subpartition, if + // requested; otherwise, we just need to ensure that the + // subpartition is a multiple of the blocking factor. + if ( j == 0 && handle_edge_low ) + { + if ( width % bf != bf_left ) width += bf_left - ( width % bf ); + } + else // if interior case + { + // Round up to the next multiple of the blocking factor. + //if ( width % bf != 0 ) width += bf - ( width % bf ); + // Round to the nearest multiple of the blocking factor. + if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); + } + } + + // We need to recompute width if the panel, according to the width + // as currently computed, would intersect the diagonal. + if ( diagoff_j < width ) + { + dim_t offm_inc, offn_inc; + + // Prune away the unstored region above the diagonal, if it exists. + // Note that the entire region was pruned initially, so we know that + // we don't need to try to prune the right side. (Also, we discard + // the offset deltas since we don't need to actually index into the + // subpartition.) + bli_prune_unstored_region_top_l( &diagoff_j, &m, &n_j, &offm_inc ); + //bli_prune_unstored_region_right_l( &diagoff_j, &m, &n_j, &offn_inc ); + + // We don't need offm_inc, offn_inc here. These statements should + // prevent compiler warnings. + ( void )offm_inc; + ( void )offn_inc; + + // Prepare to solve a quadratic equation to find the width of the + // current (jth) subpartition given the m dimension, diagonal offset, + // and area. + // NOTE: We know that the +/- in the quadratic formula must be a + + // here because we know that the desired solution (the subpartition + // width) will be smaller than (m + diagoff), not larger. If you + // don't believe me, draw a picture! 
+ const double a = -0.5; + const double b = ( double )m + ( double )diagoff_j + 0.5; + const double c = -0.5 * ( ( double )diagoff_j * + ( ( double )diagoff_j + 1.0 ) + ) - area_per_thr; + const double r = b * b - 4.0 * a * c; + + // If the quadratic solution is not imaginary, round it and use that + // as our width (but make sure it didn't round to zero). Otherwise, + // discard the quadratic solution and leave width, as previously + // computed, unchanged. + if ( r >= 0.0 ) + { + const double x = ( -b + sqrt( r ) ) / ( 2.0 * a ); + + width = ( dim_t )bli_round( x ); + if ( width == 0 ) width = 1; + } + + // Adjust the width, if necessary. + if ( j == 0 && handle_edge_low ) + { + if ( width % bf != bf_left ) width += bf_left - ( width % bf ); + } + else // if interior case + { + // Round up to the next multiple of the blocking factor. + //if ( width % bf != 0 ) width += bf - ( width % bf ); + // Round to the nearest multiple of the blocking factor. + if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); + } + } + + // Make sure that the width, after being adjusted, does not cause the + // subpartition to exceed n_j. + if ( width > n_j ) width = n_j; + + return width; +} + +siz_t bli_find_area_trap_l + ( + doff_t diagoff, + dim_t m, + dim_t n, + dim_t bf + ) +{ + dim_t offm_inc = 0; + dim_t offn_inc = 0; + double utri_area; + double blktri_area; + + // Prune away any rectangular region above where the diagonal + // intersects the left edge of the subpartition, if it exists. + bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); + + // Prune away any rectangular region to the right of where the + // diagonal intersects the bottom edge of the subpartition, if + // it exists. (This shouldn't ever be needed, since the caller + // would presumably have already performed rightward pruning, + // but it's here just in case.) 
+ //bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); + + ( void )offm_inc; + ( void )offn_inc; + + // Compute the area of the empty triangle so we can subtract it + // from the area of the rectangle that bounds the subpartition. + if ( bli_intersects_diag_n( diagoff, m, n ) ) + { + double tri_dim = ( double )( n - diagoff - 1 ); + tri_dim = bli_min( tri_dim, m - 1 ); + + utri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; + blktri_area = tri_dim * ( bf - 1.0 ) / 2.0; + } + else + { + // If the diagonal does not intersect the trapezoid, then + // we can compute the area as a simple rectangle. + utri_area = 0.0; + blktri_area = 0.0; + } + + double area = ( double )m * ( double )n - utri_area + blktri_area; + + return ( siz_t )area; +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_weighted_sub + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + uplo_t uplo_orig, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr + ) +{ + dim_t n_way = bli_thrinfo_n_way( thread ); + dim_t my_id = bli_thrinfo_work_id( thread ); + + dim_t bf_left = n % bf; + + dim_t offm_inc, offn_inc; + + siz_t area = 0; + + // In this function, we assume that the caller has already determined + // that (a) the diagonal intersects the submatrix, and (b) the submatrix + // is either lower- or upper-stored. + + if ( bli_is_lower( uplo ) ) + { + #if 0 + if ( n_way > 1 ) + printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (lower)\n", + (int)my_id, (int)(m), (int)(n), (int)(diagoff) ); + #endif + + // Prune away the unstored region above the diagonal, if it exists, + // and then to the right of where the diagonal intersects the bottom, + // if it exists. (Also, we discard the offset deltas since we don't + // need to actually index into the subpartition.) 
+ bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); + + if ( !handle_edge_low ) + { + // This branch handles the following two cases: + // - note: Edge case microtiles are marked as 'e'. + // + // uplo_orig = lower | uplo = lower + // handle edge high (orig) | handle edge high + // + // x x x x x x x x x x x x x x + // x x x x x x x x x x x x x x x x + // x x x x x x x x x -> x x x x x x x x x + // x x x x x x x x x x x x x x x x x x x x + // x x x x x x x x x x e x x x x x x x x x x e + // x x x x x x x x x x e x x x x x x x x x x e + // + // uplo_orig = upper | uplo = lower + // handle edge low (orig) | handle edge high + // + // e x x x x x x x x x x x x x x x x x + // e x x x x x x x x x x x x x x x x x x + // x x x x x x x x x x -> x x x x x x x x x + // x x x x x x x x x x x x x x x x x x x + // x x x x x x x x x x x x x x x x x x e + // x x x x x x x x x x x x x x x x x e + + // If the edge case is being handled "high", then we can employ this + // simple macro for pruning the region to the right of where the + // diagonal intersets the right side of the submatrix (which amounts + // to adjusting the n dimension). 
+ bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); + } + else // if ( handle_edge_low ) + { + // This branch handles the following two cases: + // + // uplo_orig = upper | uplo = lower + // handle edge high (orig) | handle edge low + // + // x x x x x x x x x x e e x x x x x x + // x x x x x x x x x x e e x x x x x x x + // x x x x x x x x x e -> e x x x x x x x x + // x x x x x x x x e e x x x x x x x x x + // x x x x x x x e e x x x x x x x x x x + // x x x x x x e e x x x x x x x x x x + // + // uplo_orig = lower | uplo = lower + // handle edge low (orig) | handle edge low + // + // e x x x x x x e x x x x x x + // e x x x x x x x e x x x x x x x + // e x x x x x x x x -> e x x x x x x x x + // e x x x x x x x x x e x x x x x x x x x + // e x x x x x x x x x x e x x x x x x x x x x + // e x x x x x x x x x x e x x x x x x x x x x + + // If the edge case is being handled "low", then we have to be more + // careful. The problem can be seen in certain situations when we're + // actually computing the weighted ranges for an upper-stored + // subpartition whose (a) diagonal offset is positive (though will + // always be less than NR), (b) right-side edge case exists, and (c) + // sum of (a) and (b) is less than NR. This is a problem because the + // upcoming loop that iterates over/ bli_thread_range_width_l() + // doesn't realize that the offsets associated with (a) and (b) + // belong on two separate columns of microtiles. If we naively use + // bli_prune_unstored_region_right_l() when handle_edge_low == TRUE, + // the loop over bli_thread_range_width_l() will only "see" p-1 + // IR-iterations of work to assign to threads when there are + // actually p micropanels. + + const dim_t n_inner = ( diagoff + bli_min( m, n - diagoff ) - bf_left ); + + const dim_t n_bf_iter_br = n_inner / bf; + const dim_t n_bf_left_br = n_inner % bf; + const dim_t n_bf_br = ( bf_left > 0 ? 1 : 0 ) + + n_bf_iter_br + + ( n_bf_left_br > 0 ? 
1 : 0 ); + + // Compute the number of extra columns that were included in n_bf_br + // as a result of including a full micropanel for the part of the + // submatrix that contains bf_left columns. For example, if bf = 16 + // and bf_left = 4, then bf_extra = 12. But if bf_left = 0, then we + // didn't include any extra columns. + const dim_t bf_extra = ( bf_left > 0 ? bf - bf_left : 0 ); + + // Subtract off bf_extra from n_bf_br to arrive at the "true" value + // of n that we'll use going forward. + n = n_bf_br * bf - bf_extra; + + #if 0 + if ( n_way > 1 ) + { + //printf( "thread_range_weighted_sub(): tid %d: _iter _left = %3d %3d (lower1)\n", + // (int)my_id, (int)n_bf_iter_br, (int)n_bf_left_br ); + printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (lower2)\n", + (int)my_id, (int)(m), (int)(n), (int)(diagoff) ); + } + #endif + } + + // We don't need offm_inc, offn_inc here. These statements should + // prevent compiler warnings. + ( void )offm_inc; + ( void )offn_inc; + + // Now that pruning has taken place, we know that diagoff >= 0. + + // Compute the total area of the submatrix, accounting for the + // location of the diagonal. This is done by computing the area in + // the strictly upper triangle, subtracting it off the area of the + // full rectangle, and then adding the missing strictly upper + // triangles of the bf x bf blocks along the diagonal. + double tri_dim = ( double )( n - diagoff - 1 ); + tri_dim = bli_min( tri_dim, m - 1 ); + double utri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; + + // Note that the expression below is the simplified form of: + // blktri_area = ( tri_dim / bf ) * bf * ( bf - 1.0 ) / 2.0; + double blktri_area = tri_dim * ( bf - 1.0 ) / 2.0; + + // Compute the area of the region to the right of where the diagonal + // intersects the bottom edge of the submatrix. If it instead intersects + // the right edge (or the bottom-right corner), then this region does + // not exist and so its area is explicitly set to zero. 
+ double beyondtri_dim = n - diagoff - m; + double beyondtri_area; + if ( 0 < beyondtri_dim ) beyondtri_area = beyondtri_dim * m; + else beyondtri_area = 0.0; + + // Here, we try to account for the added cost of computing columns of + // microtiles that intersect the diagonal. This is rather difficult to + // model, but this is partly due to the way non-square microtiles map + // onto the matrix relative to the diagonal, as well as additional + // overhead incurred from (potentially) computing with less-than-full + // columns of microtiles (i.e., columns for which diagoff_j < 0). + // Note that higher values for blktri_area have the net effect of + // increasing the relative size of slabs that share little or no overlap + // with the diagonal region. This is because it slightly increases the + // total area computation below, which in turn increases the area + // targeted by each thread/group earlier in the thread range, which + // for lower trapezoidal submatrices, corresponds to the regular + // rectangular region that precedes the diagonal part (if such a + // rectangular region exists). + blktri_area *= 1.5; + //blktri_area = 0.0; + + double area_total = ( double )m * ( double )n - utri_area + blktri_area + - beyondtri_area; + + // Divide the computed area by the number of ways of parallelism. + double area_per_thr = area_total / ( double )n_way; + + + // Initialize some variables prior to the loop: the offset to the + // current subpartition, the remainder of the n dimension, and + // the diagonal offset of the current subpartition. + dim_t off_j = 0; + doff_t diagoff_j = diagoff; + dim_t n_left = n; + + #if 0 + printf( "thread_range_weighted_sub(): tid %d: n_left = %3d (lower4)\n", + (int)my_id, (int)(n_left) ); + #endif + + // Iterate over the subpartition indices corresponding to each + // thread/caucus participating in the n_way parallelism. 
+ for ( dim_t j = 0; j < n_way; ++j ) + { + // Compute the width of the jth subpartition, taking the + // current diagonal offset into account, if needed. + dim_t width_j + = + bli_thread_range_width_l + ( + diagoff_j, m, n_left, + j, n_way, + bf, bf_left, + area_per_thr, + handle_edge_low + ); + + #if 0 + if ( n_way > 1 ) + printf( "thread_range_weighted_sub(): tid %d: width_j = %d doff_j = %d\n", + (int)my_id, (int)width_j, (int)diagoff_j ); + #endif + + // If the current thread belongs to caucus j, this is his + // subpartition. So we compute the implied index range and + // end our search. + #if 0 + // An alternate way of assigning work to threads such that regions + // are assigned to threads left to right *after* accounting for the + // fact that we recycle the same lower-trapezoidal code to also + // compute the upper-trapezoidal case. + bool is_my_range; + if ( bli_is_lower( uplo_orig ) ) is_my_range = ( j == my_id ); + else is_my_range = ( j == n_way - my_id - 1 ); + #else + bool is_my_range = ( j == my_id ); + #endif + + if ( is_my_range ) + { + *j_start_thr = off_j; + *j_end_thr = off_j + width_j; + + #if 0 + if ( n_way > 1 ) + printf( "thread_range_weighted_sub(): tid %d: sta end = %3d %3d\n", + (int)my_id, (int)(*j_start_thr), (int)(*j_end_thr) ); + //printf( "thread_range_weighted_sub(): tid %d: n_left = %3d\n", + // (int)my_id, (int)(n) ); + #endif + + // Compute the area of the thread's current subpartition in case + // the caller is curious how much work they were assigned. + // NOTE: This area computation isn't actually needed for BLIS to + // function properly. + area = bli_find_area_trap_l( diagoff_j, m, width_j, bf ); + + break; + } + + // Shift the current subpartition's starting and diagonal offsets, + // as well as the remainder of the n dimension, according to the + // computed width, and then iterate to the next subpartition. 
+ off_j += width_j; + diagoff_j -= width_j; + n_left -= width_j; + } + } + else // if ( bli_is_upper( uplo ) ) + { + // Express the upper-stored case in terms of the lower-stored case. + + #if 0 + if ( n_way > 1 ) + printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (upper)\n", + (int)my_id, (int)(m), (int)(n), (int)(diagoff) ); + #endif + + // First, we convert the upper-stored trapezoid to an equivalent + // lower-stored trapezoid by rotating it 180 degrees. + bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); + + // Now that the trapezoid is "flipped" in the n dimension, negate + // the bool that encodes whether to handle the edge case at the + // low (or high) end of the index range. + bli_toggle_bool( &handle_edge_low ); + + // Compute the appropriate range for the rotated trapezoid. + area = bli_thread_range_weighted_sub + ( + thread, diagoff, uplo, uplo_orig, m, n, bf, + handle_edge_low, + j_start_thr, j_end_thr + ); + + // Reverse the indexing basis for the subpartition ranges so that + // the indices, relative to left-to-right iteration through the + // unrotated upper-stored trapezoid, map to the correct columns + // (relative to the diagonal). This amounts to subtracting the + // range from n. + bli_reverse_index_direction( n, j_start_thr, j_end_thr ); + } + + return area; +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_mdim + ( + dir_t direct, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntl_family( cntl ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. 
+ if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; + bool use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. + // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); + else + return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_range_t2b( thr, x, bmult, start, end ); + else + return bli_thread_range_b2t( thr, x, bmult, start, end ); + } +} + +siz_t bli_thread_range_ndim + ( + dir_t direct, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntl_family( cntl ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. 
+ if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; + bool use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. + // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); + else + return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_range_l2r( thr, x, bmult, start, end ); + else + return bli_thread_range_r2l( thr, x, bmult, start, end ); + } +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_weighted_l2r + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the n dimension + // where the total range spans 0 to n-1 with 0 at the left end and + // n-1 at the right end. 
+ + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. + if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + area = + bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + FALSE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_l2r + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + +siz_t bli_thread_range_weighted_r2l + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the n dimension + // where the total range spans 0 to n-1 with 0 at the right end and + // n-1 at the left end. + + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. 
+ if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); + + area = + bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + TRUE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_r2l + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + +siz_t bli_thread_range_weighted_t2b + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the m dimension + // where the total range spans 0 to m-1 with 0 at the top end and + // m-1 at the bottom end. + + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. + if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + + area = + bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + FALSE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_t2b + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + +siz_t bli_thread_range_weighted_b2t + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the m dimension + // where the total range spans 0 to m-1 with 0 at the bottom end and + // m-1 at the top end. 
+ + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. + if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + + bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); + + area = bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + TRUE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_b2t + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + diff --git a/frame/thread/bli_thread_range.h b/frame/thread/bli_thread_range.h new file mode 100644 index 000000000..cf966b5a3 --- /dev/null +++ b/frame/thread/bli_thread_range.h @@ -0,0 +1,128 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_THREAD_RANGE_H +#define BLIS_THREAD_RANGE_H + +// Thread range-related prototypes. + +BLIS_EXPORT_BLIS void bli_thread_range_sub + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end + ); + +#undef GENPROT +#define GENPROT( opname ) \ +\ +siz_t PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntl_t* cntl, \ + const cntx_t* cntx, \ + dim_t* start, \ + dim_t* end \ + ); + +GENPROT( thread_range_mdim ) +GENPROT( thread_range_ndim ) + +#undef GENPROT +#define GENPROT( opname ) \ +\ +siz_t PASTEMAC0( opname ) \ + ( \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const blksz_t* bmult, \ + dim_t* start, \ + dim_t* end \ + ); + +GENPROT( thread_range_l2r ) +GENPROT( thread_range_r2l ) +GENPROT( thread_range_t2b ) +GENPROT( thread_range_b2t ) + +GENPROT( thread_range_weighted_l2r ) +GENPROT( thread_range_weighted_r2l ) +GENPROT( thread_range_weighted_t2b ) +GENPROT( thread_range_weighted_b2t ) + + +dim_t bli_thread_range_width_l + ( + doff_t diagoff_j, + dim_t m, + dim_t n_j, + dim_t j, + dim_t n_way, + dim_t bf, + dim_t bf_left, + double 
area_per_thr, + bool handle_edge_low + ); +siz_t bli_find_area_trap_l + ( + doff_t diagoff, + dim_t m, + dim_t n, + dim_t bf + ); + +siz_t bli_thread_range_weighted_sub + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + uplo_t uplo_orig, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr + ); + +#endif diff --git a/frame/thread/bli_thread_range_slab_rr.c b/frame/thread/bli_thread_range_slab_rr.c new file mode 100644 index 000000000..be4432309 --- /dev/null +++ b/frame/thread/bli_thread_range_slab_rr.c @@ -0,0 +1,134 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_thread_range_quad + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + +#ifdef BLIS_ENABLE_JRIR_RR + + const dim_t tid = bli_thrinfo_work_id( thread ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 ); + + // Use round-robin (interleaved) partitioning of jr/ir loops. + *start = tid; + *end = n_iter; + *inc = jr_nt; + +#else // #elif defined( BLIS_ENABLE_JRIR_SLAB ) || + // defined( BLIS_ENABLE_JRIR_TLB ) + + // NOTE: While this cpp conditional branch applies to both _SLAB and _TLB + // cases, this *function* should never be called when BLIS_ENABLE_JRIR_TLB + // is defined, since the function is only called from macrokernels that were + // designed for slab/rr partitioning. + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 ); + + // If there is no parallelism in this loop, set the output variables + // and return early. + if ( jr_nt == 1 ) { *start = 0; *end = n_iter; *inc = 1; return; } + + // Local variables for the computed start, end, and increment. 
+ dim_t st, en, in; + + if ( bli_intersects_diag_n( diagoff, m, n ) ) + { + // If the current submatrix intersects the diagonal, try to be + // intelligent about how threads are assigned work by using the + // quadratic partitioning function. + + bli_thread_range_weighted_sub + ( + thread, diagoff, uplo, uplo, m, n, bf, + handle_edge_low, &st, &en + ); + in = bf; + } + else + { + // If the current submatrix does not intersect the diagonal, then we + // are free to perform a uniform (and contiguous) slab partitioning. + + bli_thread_range_sub + ( + thread, n, bf, + handle_edge_low, &st, &en + ); + in = bf; + } + + // Convert the start and end column indices into micropanel indices by + // dividing by the blocking factor (which, for the jr loop, is NR). If + // either one yields a remainder, add an extra unit to the result. This + // is necessary for situations where there are t threads with t-1 or + // fewer micropanels of work, including an edge case. For example, if + // t = 3 and n = 10 (with bf = NR = 8), then we want start and end for + // each thread to be: + // + // column index upanel index + // tid 0: start, end = 0, 8 -> start, end = 0, 1 + // tid 1: start, end = 8, 10 -> start, end = 1, 2 + // tid 2: start, end = 10, 10 -> start, end = 2, 2 + // + // In this example, it's important that thread (tid) 2 gets no work, and + // we express that by specifying start = end = n, which is a non-existent + // column index. + + if ( st % bf == 0 ) *start = st / bf; + else *start = st / bf + 1; + + if ( en % bf == 0 ) *end = en / bf; + else *end = en / bf + 1; + + *inc = in / bf; + +#endif +} diff --git a/frame/thread/bli_thread_range_slab_rr.h b/frame/thread/bli_thread_range_slab_rr.h new file mode 100644 index 000000000..3e9797363 --- /dev/null +++ b/frame/thread/bli_thread_range_slab_rr.h @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_THREAD_RANGE_SLAB_RR_H +#define BLIS_THREAD_RANGE_SLAB_RR_H + +BLIS_INLINE void bli_thread_range_rr + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + const dim_t tid = bli_thrinfo_work_id( thread ); + const dim_t nt = bli_thrinfo_n_way( thread ); + const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 ); + + // Use round-robin (interleaved) partitioning of jr/ir loops. 
+ *start = tid; + *end = n_iter; + *inc = nt; +} + +BLIS_INLINE void bli_thread_range_sl + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use contiguous slab partitioning of jr/ir loops. + bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); + *inc = 1; +} + +BLIS_INLINE void bli_thread_range_slrr + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Define a general-purpose slab/rr function whose definition depends on + // whether slab or round-robin partitioning was requested at configure-time. + // Note that this function also uses the slab code path when tlb is enabled. + // If this is ever changed, make sure to change bli_is_my_iter() since they + // are used together by packm. + +#ifdef BLIS_ENABLE_JRIR_RR + bli_thread_range_rr( thread, n, bf, handle_edge_low, start, end, inc ); +#else // ifdef ( _SLAB || _TLB ) + bli_thread_range_sl( thread, n, bf, handle_edge_low, start, end, inc ); +#endif +} + +// ----------------------------------------------------------------------------- + +void bli_thread_range_quad + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ); + +#endif + diff --git a/frame/thread/bli_thread_range_tlb.c b/frame/thread/bli_thread_range_tlb.c new file mode 100644 index 000000000..546ed341d --- /dev/null +++ b/frame/thread/bli_thread_range_tlb.c @@ -0,0 +1,1699 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// ----------------------------------------------------------------------------- + +#define PRINT_MODE +#define PGUARD if ( 0 ) +//#define PRINT_RESULT + + +#if 0 +dim_t bli_thread_range_tlb + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + dim_t n_ut_for_me; + + if ( bli_is_lower( uplo ) ) + { + n_ut_for_me = bli_thread_range_tlb_l + ( + nt, tid, diagoff, m_iter, n_iter, mr, nr, j_st_p, i_st_p + ); + } + else if ( bli_is_upper( uplo ) ) + { + n_ut_for_me = bli_thread_range_tlb_u + ( + nt, tid, diagoff, m_iter, n_iter, mr, nr, j_st_p, i_st_p + ); + } + else // if ( bli_is_dense( uplo ) ) + { + n_ut_for_me = bli_thread_range_tlb_d + ( + nt, tid, m_iter, n_iter, mr, nr, j_st_p, i_st_p + ); + } + + return n_ut_for_me; +} +#endif + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_tlb_l + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // This function implements tile-level load balancing for a + // lower-trapezoidal submatrix. This partitioning guarantees that all + // threads are assigned nearly the same number of microtiles-worth of work, + // with a maximum imbalance of one microtile. It makes no effort, however, + // to account for differences in threads' workload that is attributable to + // differences in the number of edge-case (or diagonal-intersecting) + // microtiles (which incur slightly more work since they must first write + // to a temporary microtile before updating the output C matrix). + + // Assumption: -mr < diagoff. Make sure to prune leading rows beforehand! 
+ if ( diagoff <= -mr ) bli_abort(); + + // + // -- Step 1: Compute the computational area of the region ----------------- + // + + // Compute the m and n dimensions according to m_iter and n_iter. (These + // m and n dims will likely be larger than the actual m and n since they + // "round up" the edge case microtiles into full-sized microtiles.) + const dim_t m = m_iter * mr; + const dim_t n = n_iter * nr; + + // For the purposes of many computations in this function, we aren't + // interested in the extent to which diagoff exceeds n (if it does) + // So we use a new variable that is guaranteed to be no greater than n. + const doff_t diagoffmin = bli_min( diagoff, n ); + + const dim_t m_rect = m; + const dim_t n_rect = ( diagoffmin / nr ) * nr; + + const dim_t rect_area = m_rect * n_rect; + const dim_t nonrect_area = m * n - rect_area; + + //const dim_t offn_rect = 0; + const dim_t offn_nonrect = n_rect; + const dim_t diagoff_nonrect = diagoffmin - n_rect; //diagoff % nr; + + const dim_t n_nonrect = n - n_rect; + + const dim_t offn_ut_nonrect = ( diagoffmin / nr ); + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "min(diagoff,n): %7ld\n", diagoffmin ); + PGUARD printf( "offn_ut_nonrect: %7ld\n", offn_ut_nonrect ); + PGUARD printf( "offn_nonrect: %7ld\n", offn_nonrect ); + PGUARD printf( "diagoff_nonrect: %7ld\n", diagoff_nonrect ); + PGUARD printf( "n_nonrect: %7ld\n", n_nonrect ); + PGUARD printf( "---------------------------\n" ); + + dim_t num_unref_ut = 0; + + // Count the number of unreferenced utiles strictly above the diagonal. + for ( dim_t j = 0; j < n_nonrect; j += nr ) + { + const dim_t diagoff_j = diagoff_nonrect - j; + + // diagoff_j will always be at most nr - 1, but will typically be + // negative. 
This is because the non-rectangular region's diagonal + // offset will be at most nr - 1 for the first column of microtiles, + // since if it were more than nr - 1, that column would have already + // been pruned away (via the implicit pruning of diagoff_nonrect). + // NOTE: We use bli_max() to ensure that -diagoff_j / mr does not + // become negative, which can only happen if "top" pruning is not + // performed beforehand (and so it really isn't necessary here). + const dim_t num_unref_ut_j = bli_max( ( -diagoff_j / mr ), 0 ); + + num_unref_ut += num_unref_ut_j; + + PGUARD printf( "j %7ld\n", j ); + PGUARD printf( "diagoff_j %7ld\n", diagoff_j ); + PGUARD printf( "num_unref_ut_j %7ld\n", num_unref_ut_j ); + PGUARD printf( "num_unref_ut %7ld\n", num_unref_ut ); + PGUARD printf( "\n" ); + } + PGUARD printf( "---------------------------\n" ); + + const dim_t tri_unref_area = num_unref_ut * mr * nr; + const dim_t tri_ref_area = nonrect_area - tri_unref_area; + const dim_t total_ref_area = rect_area + tri_ref_area; + + PGUARD printf( "gross area: %7ld\n", m * n ); + PGUARD printf( "rect_area: %7ld\n", rect_area ); + PGUARD printf( "nonrect_area: %7ld\n", nonrect_area ); + PGUARD printf( "tri_unref_area: %7ld\n", tri_unref_area ); + PGUARD printf( "tri_ref_area: %7ld\n", tri_ref_area ); + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key utile counts (per thread, per column, etc.) 
------ + // + + const dim_t n_ut_ref = total_ref_area / ( mr * nr ); + //const dim_t n_ut_tri_ref = tri_ref_area / ( mr * nr ); + const dim_t n_ut_rect = rect_area / ( mr * nr ); + + PGUARD printf( "n_ut_ref: %7ld\n", n_ut_ref ); + //PGUARD printf( "n_ut_tri_ref: %7ld\n", n_ut_tri_ref ); + PGUARD printf( "n_ut_rect: %7ld\n", n_ut_rect ); + PGUARD printf( "---------------------------\n" ); + + // Compute the number of microtiles to allocate per thread as well as the + // number of leftover microtiles. + const dim_t n_ut_per_thr = n_ut_ref / nt; + const dim_t n_ut_pt_left = n_ut_ref % nt; + + PGUARD printf( "n_ut_per_thr: %7ld\n", n_ut_per_thr ); + PGUARD printf( "n_ut_pt_left: %7ld\n", n_ut_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t n_ut_per_col = m_iter; + + PGUARD printf( "n_ut_per_col: %7ld\n", n_ut_per_col ); + + // Allocate one of the leftover microtiles to the current thread if its + // tid is one of the lower thread ids. + const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 ); + + PGUARD printf( "n_ut_for_me: %7ld (%ld+%ld)\n", n_ut_for_me, + n_ut_per_thr, n_ut_for_me - n_ut_per_thr ); + + // Compute the number of utiles prior to the current thread's starting + // point. This is the sum of all n_ut_for_me for all thread ids less + // than tid. Notice that the second half of this expression effectively + // adds one extra microtile for each lower-valued thread id, up to + // n_ut_pt_left. + const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left ); + + PGUARD printf( "n_ut_before: %7ld\n", n_ut_before ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + dim_t j_st; + dim_t i_st; + + if ( n_ut_before < n_ut_rect ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids is strictly less than the number of + // utiles in the rectangular region. 
This means that calculating the + // starting microtile index is easy (because it does not need to + // take the location of the diagonal into account). + + PGUARD printf( "Rectangular region: n_ut_before < n_ut_rect\n" ); + PGUARD printf( "\n" ); + + const dim_t ut_index_rect_st = n_ut_before; + + PGUARD printf( "ut_index_st: %7ld\n", ut_index_rect_st ); + PGUARD printf( "---------------------------\n" ); + + j_st = ut_index_rect_st / n_ut_per_col; + i_st = ut_index_rect_st % n_ut_per_col; + + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + else // if ( n_ut_rect <= n_ut_before ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids exceeds (or equals) the number of + // utiles in the rectangular region. This means we need to observe the + // location of the diagonal to see how many utiles are referenced per + // column of utiles. + + PGUARD printf( "Diagonal region: n_ut_rect <= n_ut_before\n" ); + PGUARD printf( "\n" ); + + // This will be the number of microtile columns we will immediately + // advance past to get to the diagonal region. + const dim_t n_ut_col_adv = offn_ut_nonrect; + + PGUARD printf( "n_ut_col_adv: %7ld\n", n_ut_col_adv ); + + // In order to find j_st and i_st, we need to "allocate" n_ut_before + // microtiles. + dim_t n_ut_tba = n_ut_before; + + PGUARD printf( "n_ut_tba: %7ld\n", n_ut_tba ); + + // Advance past the rectangular region, decrementing n_ut_tba + // accordingly. + n_ut_tba -= n_ut_per_col * n_ut_col_adv; + + PGUARD printf( "n_ut_tba_1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + + // In case n_ut_tba == 0. Only happens when n_ut_before == n_ut_rect. 
+ j_st = n_ut_col_adv; + i_st = 0; + + for ( dim_t j = n_ut_col_adv; 0 < n_ut_tba; ++j ) + { + const dim_t diagoff_j = diagoffmin - j*nr; + const dim_t n_ut_skip_j = bli_max( -diagoff_j / mr, 0 ); + const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j; + + PGUARD printf( "j: %7ld\n", j ); + PGUARD printf( "diagoff_j: %7ld\n", diagoff_j ); + PGUARD printf( "n_ut_skip_j: %7ld\n", n_ut_skip_j ); + PGUARD printf( "n_ut_this_col: %7ld\n", n_ut_this_col ); + PGUARD printf( "n_ut_tba_j0: %7ld\n", n_ut_tba ); + + if ( n_ut_tba < n_ut_this_col ) + { + // If the number of utiles to allocate is less than the number + // in this column, we know that j_st will refer to the current + // column. To find i_st, we first skip to the utile that + // intersects the diagonal and then add n_ut_tba. (n_ut_skip_j + // is the count of utiles in column j that precede that utile; + // it is also what n_ut_this_col excludes from the column.) + j_st = j; + i_st = n_ut_skip_j + n_ut_tba; + PGUARD printf( "j_st, i_st (fnl<) %4ld,%4ld\n", j_st, i_st ); + } + else if ( n_ut_tba == n_ut_this_col ) + { + // If the number of utiles to allocate is exactly equal to the + // number in this column, we know that j_st will refer to the + // *next* column. But to find i_st, we will have to take the + // location of the diagonal into account, so we recompute the + // skip count using the next column's diagonal offset. + const doff_t diagoff_jp1 = diagoff_j - nr; + const dim_t n_ut_skip_jp1 = bli_max( -diagoff_jp1 / mr, 0 ); + + j_st = j + 1; + i_st = n_ut_skip_jp1; + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + + // No matter what (especially if the number of utiles to allocate + // exceeds the number in this column), we decrement n_ut_tba and + // attempt to continue to the next iteration. (Note: If either of the + // two branches above is triggered, n_ut_tba will be decremented down + // to zero (or less), in which case this will be the final iteration.)
+ n_ut_tba -= n_ut_this_col; + + PGUARD printf( "n_ut_tba_j1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + } + } + + // + // -- Step 4: Save the results --------------------------------------------- + // + + *j_st_p = j_st; + *i_st_p = i_st; + + #ifdef PRINT_RESULT + printf( "j_st, i_st (mem) %4ld,%4ld (n_ut: %4ld)\n", + j_st, i_st, n_ut_for_me ); + #endif + + // Return the number of utiles that this thread was allocated. + return n_ut_for_me; +} + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_tlb_u + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // This function implements tile-level load balancing for an + // upper-trapezoidal submatrix. This partitioning guarantees that all + // threads are assigned nearly the same number of microtiles-worth of work, + // with a maximum imbalance of one microtile. It makes no effort, however, + // to account for differences in threads' workload that is attributable to + // differences in the number of edge-case (or diagonal-intersecting) + // microtiles (which incur slightly more work since they must first write + // to a temporary microtile before updating the output C matrix). + + // Assumption: diagoff < nr. Make sure to prune leading columns beforehand! + if ( nr <= diagoff ) bli_abort(); + + // + // -- Step 1: Compute the computational area of the region ----------------- + // + + // Compute the m and n dimensions according to m_iter and n_iter. (These + // m and n dims will likely be larger than the actual m and n since they + // "round up" the edge case microtiles into full-sized microtiles.) 
+ const dim_t m = m_iter * mr; + const dim_t n = n_iter * nr; + + // For the purposes of many computations in this function, we aren't + // interested in the extent to which diagoff exceeds -m (if it does) + // So we use a new variable that is guaranteed to be no less than -m. + const doff_t diagoffmin = bli_max( diagoff, -m ); + + const dim_t m_rect = m; + const dim_t n_rect = ( -diagoffmin / nr ) * nr; + + const dim_t rect_area = m_rect * n_rect; + const dim_t nonrect_area = m * n - rect_area; + + const dim_t offn_rect = n - n_rect; + //const dim_t offn_nonrect = 0; + const dim_t diagoff_nonrect = diagoffmin; + + const dim_t n_nonrect = n - n_rect; + + const dim_t offn_ut_rect = n_iter + ( diagoffmin / nr ); + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "max(diagoff,-m): %7ld\n", diagoffmin ); + PGUARD printf( "offn_ut_rect: %7ld\n", offn_ut_rect ); + PGUARD printf( "offn_rect: %7ld\n", offn_rect ); + PGUARD printf( "diagoff_nonrect: %7ld\n", diagoff_nonrect ); + PGUARD printf( "n_nonrect: %7ld\n", n_nonrect ); + PGUARD printf( "---------------------------\n" ); + + dim_t num_unref_ut = 0; + + // Count the number of unreferenced utiles strictly below the diagonal. + for ( dim_t j = 0; j < n_nonrect; j += nr ) + { + const dim_t diagoff_j = diagoff_nonrect - j; + + // diagoff_j will always be at most nr - 1, but will typically be + // negative. This is because the non-rectangular region's diagonal + // offset will be at most nr - 1 for the first column of microtiles, + // since if it were more than nr - 1, that column would have already + // been pruned away (prior to this function being called). + // NOTE: We use bli_max() to ensure that ( m + diagoff_j - nr ) / mr + // does not become negative, which can happen in some situations + // during the first iteration if diagoff is relatively close to -m. 
+ // NOTE: We subtract nr from diagoff_j since it's really the diagonal + // offset of the *next* column of utiles that needs to be used to + // determine how many utiles are referenced in the current column. + const dim_t num_unref_ut_j = bli_max( ( m + diagoff_j - nr ) / mr, 0 ); + + num_unref_ut += num_unref_ut_j; + + PGUARD printf( "j %7ld\n", j ); + PGUARD printf( "diagoff_j - nr %7ld\n", diagoff_j - nr ); + PGUARD printf( "num_unref_ut_j %7ld\n", num_unref_ut_j ); + PGUARD printf( "num_unref_ut %7ld\n", num_unref_ut ); + PGUARD printf( "\n" ); + } + PGUARD printf( "---------------------------\n" ); + + const dim_t tri_unref_area = num_unref_ut * mr * nr; + const dim_t tri_ref_area = nonrect_area - tri_unref_area; + const dim_t total_ref_area = rect_area + tri_ref_area; + + PGUARD printf( "gross area: %7ld\n", m * n ); + PGUARD printf( "rect_area: %7ld\n", rect_area ); + PGUARD printf( "nonrect_area: %7ld\n", nonrect_area ); + PGUARD printf( "tri_unref_area: %7ld\n", tri_unref_area ); + PGUARD printf( "tri_ref_area: %7ld\n", tri_ref_area ); + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key utile counts (per thread, per column, etc.) ------ + // + + const dim_t n_ut_ref = total_ref_area / ( mr * nr ); + const dim_t n_ut_tri_ref = tri_ref_area / ( mr * nr ); + //const dim_t n_ut_rect = rect_area / ( mr * nr ); + + PGUARD printf( "n_ut_ref: %7ld\n", n_ut_ref ); + PGUARD printf( "n_ut_tri_ref: %7ld\n", n_ut_tri_ref ); + //PGUARD printf( "n_ut_rect: %7ld\n", n_ut_rect ); + PGUARD printf( "---------------------------\n" ); + + // Compute the number of microtiles to allocate per thread as well as the + // number of leftover microtiles. 
+ const dim_t n_ut_per_thr = n_ut_ref / nt; + const dim_t n_ut_pt_left = n_ut_ref % nt; + + PGUARD printf( "n_ut_per_thr: %7ld\n", n_ut_per_thr ); + PGUARD printf( "n_ut_pt_left: %7ld\n", n_ut_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t n_ut_per_col = m_iter; + + PGUARD printf( "n_ut_per_col: %7ld\n", n_ut_per_col ); + + // Allocate one of the leftover microtiles to the current thread if its + // tid is one of the lower thread ids. + const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 ); + + PGUARD printf( "n_ut_for_me: %7ld (%ld+%ld)\n", n_ut_for_me, + n_ut_per_thr, n_ut_for_me - n_ut_per_thr ); + + // Compute the number of utiles prior to the current thread's starting + // point. This is the sum of all n_ut_for_me for all thread ids less + // than tid. Notice that the second half of this expression effectively + // adds one extra microtile for each lower-valued thread id, up to + // n_ut_pt_left. + const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left ); + + PGUARD printf( "n_ut_before: %7ld\n", n_ut_before ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + dim_t j_st; + dim_t i_st; + + if ( n_ut_tri_ref <= n_ut_before ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids exceeds (or equals) the number of + // utiles in the diagonal region. This means that calculating the + // starting microtile index is easy (because it does not need to + // take the location of the diagonal into account). 
+ + PGUARD printf( "Rectangular region: n_ut_tri_ref <= n_ut_before\n" ); + PGUARD printf( "\n" ); + + const dim_t ut_index_rect_st = n_ut_before - n_ut_tri_ref; + + PGUARD printf( "ut_index_rect_st: %7ld\n", ut_index_rect_st ); + PGUARD printf( "---------------------------\n" ); + + j_st = offn_ut_rect + ut_index_rect_st / n_ut_per_col; + i_st = ut_index_rect_st % n_ut_per_col; + + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + else // if ( n_ut_before < n_ut_tri_ref ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids is strictly less than the number of + // utiles in the diagonal region. This means we need to observe the + // location of the diagonal to see how many utiles are referenced per + // column of utiles. + + PGUARD printf( "Diagonal region: n_ut_before < n_ut_tri_ref\n" ); + PGUARD printf( "\n" ); + + // This will be the number of microtile columns we will immediately + // advance past to get to the diagonal region. + const dim_t n_ut_col_adv = 0; + + PGUARD printf( "n_ut_col_adv: %7ld\n", n_ut_col_adv ); + + // In order to find j_st and i_st, we need to "allocate" n_ut_before + // microtiles. + dim_t n_ut_tba = n_ut_before; + + PGUARD printf( "n_ut_tba: %7ld\n", n_ut_tba ); + + // No need to advance since the upper-trapezoid begins with the + // diagonal region. + //n_ut_tba -= 0; + + PGUARD printf( "n_ut_tba_1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + + // In case n_ut_tba == 0. Only happens when n_ut_before == 0. 
+ j_st = 0; + i_st = 0; + + for ( dim_t j = n_ut_col_adv; 0 < n_ut_tba; ++j ) + { + const dim_t diagoff_j = diagoffmin - j*nr; + const dim_t n_ut_skip_j = bli_max( ( m + diagoff_j - nr ) / mr, 0 ); + const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j; + + PGUARD printf( "j: %7ld\n", j ); + PGUARD printf( "diagoff_j: %7ld\n", diagoff_j ); + PGUARD printf( "n_ut_skip_j: %7ld\n", n_ut_skip_j ); + PGUARD printf( "n_ut_this_col: %7ld\n", n_ut_this_col ); + PGUARD printf( "n_ut_tba_j0: %7ld\n", n_ut_tba ); + + if ( n_ut_tba < n_ut_this_col ) + { + // If the number of utiles to allocate is less than the number + // in this column, we know that j_st will refer to the current + // column. To find i_st, we simply use n_ut_tba; n_ut_skip_j + // only reduces this column's utile count (n_ut_this_col) and + // is not added to i_st. + j_st = j; + i_st = n_ut_tba; + PGUARD printf( "j_st, i_st (fnl<) %4ld,%4ld\n", j_st, i_st ); + } + else if ( n_ut_tba == n_ut_this_col ) + { + // If the number of utiles to allocate is exactly equal to the + // number in this column, we know that j_st will refer to the + // *next* column. In this situation, i_st will always be 0. + + j_st = j + 1; + i_st = 0; + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + + // No matter what (especially if the number of utiles to allocate + // exceeds the number in this column), we decrement n_ut_tba and + // attempt to continue to the next iteration. (Note: If either of the + // two branches above is triggered, n_ut_tba will be decremented down + // to zero (or less), in which case this will be the final iteration.) + n_ut_tba -= n_ut_this_col; + + PGUARD printf( "n_ut_tba_j1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + } + } + + // + // -- Step 4: Save the results --------------------------------------------- + // + + *j_st_p = j_st; + *i_st_p = i_st; + + #ifdef PRINT_RESULT + printf( "j_st, i_st (mem) %4ld,%4ld (n_ut: %4ld)\n", + j_st, i_st, n_ut_for_me ); + #endif + + // Return the number of utiles that this thread was allocated.
+ return n_ut_for_me; +} + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_tlb_d + ( + const dim_t nt, + const dim_t tid, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // This function implements tile-level load balancing for a + // general/dense submatrix. This partitioning guarantees that all + // threads are assigned nearly the same number of microtiles-worth of work, + // with a maximum imbalance of one microtile. It makes no effort, however, + // to account for differences in threads' workload that is attributable to + // differences in the number of edge-case microtiles (which incur slightly + // more work since they must first write to a temporary microtile before + // updating the output C matrix). + + // + // -- Step 1: Compute the computational area of the region ----------------- + // + + // Compute the m and n dimensions according to m_iter and n_iter. (These + // m and n dims will likely be larger than the actual m and n since they + // "round up" the edge case microtiles into full-sized microtiles.) + const dim_t m = m_iter * mr; + const dim_t n = n_iter * nr; + + const dim_t m_rect = m; + const dim_t n_rect = n; + + const dim_t total_ref_area = m_rect * n_rect; + + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key utile counts (per thread, per column, etc.) ------ + // + + const dim_t n_ut_ref = total_ref_area / ( mr * nr ); + + PGUARD printf( "n_ut_ref: %7ld\n", n_ut_ref ); + PGUARD printf( "---------------------------\n" ); + + // Compute the number of microtiles to allocate per thread as well as the + // number of leftover microtiles. 
+ const dim_t n_ut_per_thr = n_ut_ref / nt; + const dim_t n_ut_pt_left = n_ut_ref % nt; + + PGUARD printf( "n_ut_per_thr: %7ld\n", n_ut_per_thr ); + PGUARD printf( "n_ut_pt_left: %7ld\n", n_ut_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t n_ut_per_col = m_iter; + + PGUARD printf( "n_ut_per_col: %7ld\n", n_ut_per_col ); + + // Allocate one of the leftover microtiles to the current thread if its + // tid is one of the lower thread ids. + const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 ); + + PGUARD printf( "n_ut_for_me: %7ld (%ld+%ld)\n", n_ut_for_me, + n_ut_per_thr, n_ut_for_me - n_ut_per_thr ); + + // Compute the number of utiles prior to the current thread's starting + // point. This is the sum of all n_ut_for_me for all thread ids less + // than tid. Notice that the second half of this expression effectively + // adds one extra microtile for each lower-valued thread id, up to + // n_ut_pt_left. + const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left ); + + PGUARD printf( "n_ut_before: %7ld\n", n_ut_before ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + const dim_t ut_index_st = n_ut_before; + + PGUARD printf( "ut_index_st: %7ld\n", ut_index_st ); + PGUARD printf( "---------------------------\n" ); + + const dim_t j_st = ut_index_st / n_ut_per_col; + const dim_t i_st = ut_index_st % n_ut_per_col; + + // + // -- Step 4: Save the results --------------------------------------------- + // + + *j_st_p = j_st; + *i_st_p = i_st; + + #ifdef PRINT_RESULT + printf( "j_st, i_st (mem) %4ld,%4ld (n_ut: %4ld)\n", + j_st, i_st, n_ut_for_me ); + #endif + + // Return the number of utiles that this thread was allocated. 
+ return n_ut_for_me; +} + +// ----------------------------------------------------------------------------- + +// Returns the k dimension, in iterations, charged to the utile in row ir_iter +// when bli_thread_range_tlb_trmm_lx_impl() tallies uops: +// lower: min( diagoff_iter + ir_iter + 1, k_iter ) +// upper: k_iter - max( diagoff_iter + ir_iter, 0 ) +BLIS_INLINE dim_t bli_tlb_trmm_lx_k_iter + ( + const doff_t diagoff_iter, + const uplo_t uplo, + const dim_t k_iter, + const dim_t ir_iter + ) +{ + if ( bli_is_lower( uplo ) ) + return bli_min( diagoff_iter + ( ir_iter + 1 ), k_iter ); + else // if ( bli_is_upper( uplo ) ) + return k_iter - bli_max( diagoff_iter + ir_iter, 0 ); +} + +// Returns k_iter - max( jr_iter - diagoff_iter, 0 ) for utile column jr_iter. +// NOTE(review): presumably the per-column counterpart used by the _rl_impl +// routine; confirm against that function's body. +BLIS_INLINE dim_t bli_tlb_trmm_rl_k_iter + ( + const doff_t diagoff_iter, + const dim_t k_iter, + const dim_t jr_iter + ) +{ + return k_iter - bli_max( -diagoff_iter + jr_iter, 0 ); +} + +// ----------------------------------------------------------------------------- + +// Lower variant: forwards every argument to +// bli_thread_range_tlb_trmm_lx_impl() with uplo fixed to BLIS_LOWER. +dim_t bli_thread_range_tlb_trmm_ll + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + return bli_thread_range_tlb_trmm_lx_impl + ( + nt, tid, diagoff, BLIS_LOWER, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p + ); +} + +// Upper variant: forwards every argument to +// bli_thread_range_tlb_trmm_lx_impl() with uplo fixed to BLIS_UPPER. +dim_t bli_thread_range_tlb_trmm_lu + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + return bli_thread_range_tlb_trmm_lx_impl + ( + nt, tid, diagoff, BLIS_UPPER, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p + ); +} + +dim_t bli_thread_range_tlb_trmm_lx_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // Assumption: 0 <= diagoff (lower); diagoff <= 0 (upper). + // Make sure to prune leading rows (lower) or columns (upper) beforehand!
+ if ( bli_is_lower( uplo ) && diagoff < 0 ) bli_abort(); + else if ( bli_is_upper( uplo ) && diagoff > 0 ) bli_abort(); + + // Single-threaded cases are simple and allow early returns: the lone + // thread starts at utile (0,0) and is allocated all m_iter * n_iter utiles. + if ( nt == 1 ) + { + const dim_t n_ut_for_me = m_iter * n_iter; + + *j_st_p = 0; + *i_st_p = 0; + + return n_ut_for_me; + } + + // + // -- Step 1: Compute the computational flop cost of each utile column ----- + // + + // Normalize the diagonal offset by mr so that it represents the offset in + // units of mr x mr chunks. + const doff_t diagoff_iter = diagoff / mr; + + // Determine the actual k dimension, in units of mr x mr iterations, capped + // by the k_iter given by the caller. (This is done per utile row by + // bli_tlb_trmm_lx_k_iter() in the loop below.) + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "m_iter: %7ld\n", m_iter ); + PGUARD printf( "n_iter: %7ld\n", n_iter ); + PGUARD printf( "k_iter: %7ld\n", k_iter ); + PGUARD printf( "mr: %7ld\n", mr ); + PGUARD printf( "nr: %7ld\n", nr ); + PGUARD printf( "diagoff_iter: %7ld\n", diagoff_iter ); + + dim_t uops_per_col = 0; + + // Compute the computational flop cost of each microtile column, normalized + // by the number of flops performed by each mr x nr rank-1 update. This + // is simply the sum of all of the k dimensions of each micropanel, up to + // and including (lower) or starting from (upper) the part that intersects + // the diagonal, or the right (lower) or left (upper) edge of the matrix, + // as applicable. + for ( dim_t i = 0; i < m_iter; ++i ) + { + // Don't allow k_i_iter to exceed k_iter, which is the maximum possible + // k dimension (in units of mr x mr chunks of micropanel). + const dim_t k_i_iter + = bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i ); + + uops_per_col += k_i_iter; + } + + PGUARD printf( "uops_per_col: %7ld\n", uops_per_col ); + + // + // -- Step 2: Compute key flop counts (per thread, per column, etc.) ------- + // + + // Compute the total cost for the entire block-panel multiply.
+ const dim_t total_uops = uops_per_col * n_iter; + + // Compute the number of microtile ops to allocate per thread as well as the + // number of leftover microtile ops. + const dim_t n_uops_per_thr = total_uops / nt; + const dim_t n_uops_pt_left = total_uops % nt; + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "total_uops: %7ld\n", total_uops ); + PGUARD printf( "n_uops_per_thr: %7ld\n", n_uops_per_thr ); + PGUARD printf( "n_uops_pt_left: %7ld\n", n_uops_pt_left ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "total_utiles: %7ld\n", m_iter * n_iter ); + PGUARD printf( "---------------------------\n" ); + + dim_t j_st_cur = 0; dim_t j_en_cur = 0; + dim_t i_st_cur = 0; dim_t i_en_cur = 0; + + PGUARD printf( " tid %ld will start at j,i: %ld %ld\n", + ( dim_t )0, j_st_cur, i_st_cur ); + + // Find the utile update that pushes uops_tba to 0 or less. +#ifdef PRINT_MODE + for ( dim_t tid_i = 0; tid_i < nt; ++tid_i ) +#else + for ( dim_t tid_i = 0; tid_i < nt - 1; ++tid_i ) +#endif + { + const dim_t uops_ta = n_uops_per_thr + ( tid_i < n_uops_pt_left ? 1 : 0 ); + dim_t uops_tba = uops_ta; + dim_t j = j_st_cur; + dim_t n_ut_for_me = 0; + bool done_e = FALSE; + + PGUARD printf( "tid_i: %ld n_uops to alloc: %3ld \n", tid_i, uops_tba ); + + // This code begins allocating uops when the starting point is somewhere + // after the first microtile. Typically this will not be enough to + // allocate all uops, except for small matrices (and/or high numbers of + // threads), in which case the code signals an early finish (via done_e). 
+ if ( 0 < i_st_cur ) + { + dim_t i; + + //PGUARD printf( "tid_i: %ld uops left to alloc: %2ld \n", tid_i, uops_tba ); + + for ( i = i_st_cur; i < m_iter; ++i ) + { + n_ut_for_me += 1; + + const dim_t uops_tba_new + = uops_tba - + bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i ); + + uops_tba = uops_tba_new; + + PGUARD printf( "tid_i: %ld i: %2ld (1 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + + if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE; + break; } + } + + if ( i == m_iter ) j += 1; + } + + // This code advances over as many columns of utiles as possible and then + // walks down to the correct utile within the subsequent column. However, + // it gets skipped entirely if the previous code block was able to + // allocate all of the current tid's uops. + if ( !done_e ) + { + const dim_t j_inc0 = uops_tba / uops_per_col; + const dim_t j_left0 = uops_tba % uops_per_col; + + // We need to set a hard limit on how much j_inc can be. Namely, + // it should not exceed the number of utile columns that are left + // in the matrix. We also correctly compute j_left when the initial + // computation of j_inc0 above exceeds the revised j_inc, but this + // is mostly only so that in these situations the debug statements + // report the correct numbers. + const dim_t j_inc = bli_min( j_inc0, n_iter - j ); + const dim_t delta = j_inc0 - j_inc; + const dim_t j_left = j_left0 + delta * uops_per_col; + + // Increment j by the number of full utile columns we allocate, and + // set the remaining utile ops to be allocated to the remainder. 
+ j += j_inc; + uops_tba = j_left; + + n_ut_for_me += j_inc * m_iter; + + PGUARD printf( "tid_i: %ld advanced to col: %2ld (uops traversed: %ld)\n", + tid_i, j, uops_per_col * j_inc ); + PGUARD printf( "tid_i: %ld j: %2ld ( n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + PGUARD printf( "tid_i: %ld uops left to alloc: %2ld \n", tid_i, j_left ); + + if ( uops_tba == 0 ) + { + // If advancing j_inc columns allocated all of our uops, then + // designate the last iteration of the previous column as the + // end point. + j_en_cur = j - 1; + i_en_cur = m_iter - 1; + } + else if ( j > n_iter ) bli_abort(); // safety check. + else if ( j == n_iter ) + { + // If we still have at least some uops to allocate, and advancing + // j_inc columns landed us at the beginning of the first non- + // existent column (column n_iter), then we're done. (The fact + // that we didn't get to allocate all of our uops just means that + // the lower tids slightly overshot their allocations, leaving + // fewer uops for the last thread.) + } + else // if ( 0 < uops_tba && j < n_iter ) + { + // If we have at least some uops to allocate, and we still have + // at least some columns to process, then we search for the + // utile that will put us over the top. 
+ + for ( dim_t i = 0; i < m_iter; ++i ) + { + n_ut_for_me += 1; + + const dim_t uops_tba_new + = uops_tba - + bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i ); + + uops_tba = uops_tba_new; + + PGUARD printf( "tid_i: %ld i: %2ld (4 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + + if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; + break; } + } + } + } + + + PGUARD printf( "tid_i: %ld (5 n_ut_cur: %ld) (overshoot: %ld out of %ld)\n", + tid_i, n_ut_for_me, -uops_tba, uops_ta ); + + if ( tid_i == tid ) + { + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + return n_ut_for_me; + } + + // Use the current tid's ending i,j values to determine the starting i,j + // values for the next tid. + j_st_cur = j_en_cur; + i_st_cur = i_en_cur + 1; + if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; } + + PGUARD printf( "tid_i: %ld (6 n_ut_cur: %ld)\n", + tid_i, n_ut_for_me ); + PGUARD printf( "tid_i: %ld tid %ld will start at j,i: %ld %ld\n", + tid_i, tid_i + 1, j_st_cur, i_st_cur ); + PGUARD printf( "---------------------------\n" ); + } + +#ifndef PRINT_MODE + + // + // -- Step 4: Handle the last thread's allocation -------------------------- + // + + // An optimization: The above loop runs to nt - 1 rather than nt since it's + // easy to count the number of utiles allocated to the last thread. + const dim_t n_ut_for_me = m_iter - i_st_cur + + (n_iter - j_st_cur - 1) * m_iter; + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + + PGUARD printf( "tid_i: %ld (7 n_ut_for_me: %ld) (j,i_st: %ld %ld)\n", + tid, n_ut_for_me, j_st_cur, i_st_cur ); + + return n_ut_for_me; +#else + // This line should never execute, but we need it to satisfy the compiler. 
+ return -1; +#endif +} + +// ----------------------------------------------------------------------------- + +#if 0 +dim_t bli_thread_range_tlb_trmm_r + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + dim_t n_ut_for_me; + + if ( bli_is_lower( uplo ) ) + { + inc_t j_en_l, i_en_l; + + n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid, diagoff, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p, &j_en_l, &i_en_l + ); + } + else // if ( bli_is_upper( uplo ) ) + { + inc_t j_st_l, i_st_l; + inc_t j_en_l, i_en_l; + + // Reverse the effective tid and use the diagonal offset as if the m and + // n dimension were reversed (similar to a 180 degree rotation). This + // transforms the problem into one of allocating ranges for a lower- + // triangular matrix, for which we already have a special routine. + const dim_t tid_rev = nt - tid - 1; + const doff_t diagoff_rev = nr*n_iter - ( nr*k_iter + diagoff ); + + n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid_rev, diagoff_rev, m_iter, n_iter, k_iter, mr, nr, + &j_st_l, &i_st_l, &j_en_l, &i_en_l + ); + + // The ending j and i offsets will serve as our starting offsets + // returned to the caller, but first we have to reverse the offsets so + // that their semantics are once again relative to an upper-triangular + // matrix. 
+ j_en_l = n_iter - j_en_l - 1; + i_en_l = m_iter - i_en_l - 1; + + *j_st_p = j_en_l; + *i_st_p = i_en_l; + } + + return n_ut_for_me; +} +#endif + +dim_t bli_thread_range_tlb_trmm_rl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + inc_t j_en_l, i_en_l; + + return bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid, diagoff, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p, &j_en_l, &i_en_l + ); +} + +dim_t bli_thread_range_tlb_trmm_ru + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + inc_t j_st_l, i_st_l; + inc_t j_en_l, i_en_l; + + // Reverse the effective tid and use the diagonal offset as if the m and + // n dimension were reversed (similar to a 180 degree rotation). This + // transforms the problem into one of allocating ranges for a lower- + // triangular matrix, for which we already have a special routine. + const dim_t tid_rev = nt - tid - 1; + const doff_t diagoff_rev = nr*n_iter - ( nr*k_iter + diagoff ); + + const dim_t n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid_rev, diagoff_rev, m_iter, n_iter, k_iter, mr, nr, + &j_st_l, &i_st_l, &j_en_l, &i_en_l + ); + + // The ending j and i offsets will serve as our starting offsets + // returned to the caller, but first we have to reverse the offsets so + // that their semantics are once again relative to an upper-triangular + // matrix. 
+ j_en_l = n_iter - j_en_l - 1; + i_en_l = m_iter - i_en_l - 1; + + *j_st_p = j_en_l; + *i_st_p = i_en_l; + + return n_ut_for_me; +} + +dim_t bli_thread_range_tlb_trmm_rl_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p, + inc_t* j_en_p, + inc_t* i_en_p + ) +{ + // Assumption: 0 <= diagoff. Make sure to prune leading rows beforehand! + if ( diagoff < 0 ) bli_abort(); + + // Single-threaded cases are simple and allow early returns. + if ( nt == 1 ) + { + const dim_t n_ut_for_me = m_iter * n_iter; + + *j_st_p = 0; + *i_st_p = 0; + *j_en_p = n_iter - 1; + *i_en_p = m_iter - 1; + + return n_ut_for_me; + } + + // + // -- Step 1: Compute the computational volume of the region --------------- + // + + // Normalize the diagonal offset by nr so that it represents the offset in + // units of nr x nr chunks. + const doff_t diagoff_iter = diagoff / nr; + + // For the purposes of many computations in this function, we aren't + // interested in the extent to which diagoff exceeds n (if it does) + // So we use a new variable that is guaranteed to be no greater than n. 
+ const doff_t diagoffmin_iter = bli_min( diagoff_iter, n_iter ); + + const dim_t k_rect = k_iter; + const dim_t n_rect = diagoffmin_iter; + + const dim_t gross_area = k_rect * n_iter; + const dim_t rect_area = k_rect * n_rect; + const dim_t nonrect_area = gross_area - rect_area; + + const dim_t offn_nonrect = n_rect; + const dim_t diagoff_nonrect = 0; + + const dim_t n_nonrect = n_iter - n_rect; + + const dim_t offn_ut_nonrect = diagoffmin_iter; + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "m_iter: %7ld\n", m_iter ); + PGUARD printf( "k_iter: %7ld\n", k_iter ); + PGUARD printf( "n_iter: %7ld\n", n_iter ); + PGUARD printf( "min(diagoff_it,n): %7ld\n", diagoffmin_iter ); + PGUARD printf( "offn_ut_nonrect: %7ld\n", offn_ut_nonrect ); + PGUARD printf( "offn_nonrect: %7ld\n", offn_nonrect ); + PGUARD printf( "diagoff_nonrect: %7ld\n", diagoff_nonrect ); + PGUARD printf( "n_nonrect: %7ld\n", n_nonrect ); + PGUARD printf( "---------------------------\n" ); + + const dim_t num_unref_ut0 = n_nonrect * ( n_nonrect - 1 ) / 2; + const dim_t num_unref_ut = bli_max( 0, num_unref_ut0 ); + + const dim_t tri_unref_area = num_unref_ut; + const dim_t tri_ref_area = nonrect_area - tri_unref_area; + const dim_t total_ref_area = rect_area + tri_ref_area; + const dim_t rect_vol = rect_area * m_iter; + const dim_t tri_ref_vol = tri_ref_area * m_iter; + const dim_t total_vol = total_ref_area * m_iter; + + PGUARD printf( "gross_area: %7ld\n", gross_area ); + PGUARD printf( "nonrect_area: %7ld\n", nonrect_area ); + PGUARD printf( "tri_unref_area: %7ld\n", tri_unref_area ); + PGUARD printf( "rect_area: %7ld\n", rect_area ); + PGUARD printf( "tri_ref_area: %7ld\n", tri_ref_area ); + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "rect_vol (uops): %7ld\n", rect_vol ); + PGUARD printf( "tri_ref_vol (uops): %7ld\n", tri_ref_vol ); + PGUARD printf( "total_vol (uops): %7ld\n", total_vol ); + 
PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key flop counts (per thread, per column, etc.) ------- + // + + //const dim_t rect_uops = rect_vol; + //const dim_t tri_ref_uops = tri_ref_vol; + const dim_t total_uops = total_vol; + + // Compute the number of microtile ops to allocate per thread as well as the + // number of leftover microtile ops. + const dim_t n_uops_per_thr = total_uops / nt; + const dim_t n_uops_pt_left = total_uops % nt; + + PGUARD printf( "n_threads: %7ld\n", nt ); + PGUARD printf( "n_uops_per_thr: %7ld\n", n_uops_per_thr ); + PGUARD printf( "n_uops_pt_left: %7ld\n", n_uops_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t uops_per_col_rect = m_iter * k_iter; + + PGUARD printf( "uops_per_col_rect: %7ld\n", uops_per_col_rect ); + + // Allocate one of the leftover uops to the current thread if its tid is + // one of the lower thread ids. + //const dim_t n_uops_for_me = n_uops_per_thr + ( tid < n_uops_pt_left ? 1 : 0 ); + + //PGUARD printf( "n_uops_for_me: %7ld (%ld+%ld)\n", + // n_uops_for_me, n_uops_per_thr, n_uops_for_me - n_uops_per_thr ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "total_utiles: %7ld\n", m_iter * n_iter ); + PGUARD printf( "---------------------------\n" ); + + dim_t j_st_cur = 0; dim_t j_en_cur = 0; + dim_t i_st_cur = 0; dim_t i_en_cur = 0; + + // Find the utile update that pushes uops_tba to 0 or less. +#ifdef PRINT_MODE + for ( dim_t tid_i = 0; tid_i < nt; ++tid_i ) +#else + for ( dim_t tid_i = 0; tid_i < nt - 1; ++tid_i ) +#endif + { + const dim_t uops_ta = n_uops_per_thr + ( tid_i < n_uops_pt_left ? 
1 : 0 ); + dim_t uops_tba = uops_ta; + dim_t j = j_st_cur; + dim_t n_ut_for_me = 0; + bool done_e = FALSE; + bool search_tri = FALSE; + + PGUARD printf( "tid_i: %ld n_uops_ta: %3ld \n", tid_i, uops_tba ); + PGUARD printf( "tid_i: %ld j: %2ld ( n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + + // This code begins allocating uops when the starting point is somewhere + // after the first microtile. Typically this will not be enough to + // allocate all uops, except for situations where the number of threads + // is high relative to the number of utile columns, in which case the + // code signals an early finish (via done_e). + if ( 0 < i_st_cur ) + { + // Compute the number of uops needed to update each utile in the + // current column. + const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j ); + + dim_t i; + + #if 0 + + // Starting from i_st_cur within the current utile column, allocate + // utiles until (a) we run out of utiles in the column (which is typically + // what happens), or (b) we finish allocating all uops for the current + // thread (uops_tba drops to zero or less). + for ( i = i_st_cur; i < m_iter; ++i ) + { + n_ut_for_me += 1; + + const dim_t uops_tba_new = uops_tba - k_iter_j; + + uops_tba = uops_tba_new; + + PGUARD printf( "tid_i: %ld i: %2ld (0 n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba, k_iter_j ); + + if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE; + break; } + } + + // If we traversed the entire column (regardless of whether we finished + // allocating utiles for the current thread), increment j to the next + // column, which is where we'll continue our search for the current tid + // (or start our search for the next tid if we finished allocating utiles). + // Additionally, if we finished traversing all utile columns, mark the + // last utile of the last column as the end point, and set the "done early" + // flag. 
+ if ( i == m_iter ) + { + j += 1; + if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; done_e = TRUE; } + } + + #else + + // Compute the number of utiles left to allocate under the (probably false) + // assumption that all utiles incur the same uop cost (k_iter_j) to update. + // Also compute the number of utiles that remain in the current column. + const dim_t n_ut_tba_j = uops_tba / k_iter_j + ( uops_tba % k_iter_j ? 1 : 0 ); + const dim_t n_ut_rem_j = m_iter - i_st_cur; + + // Compare the aforementioned values. If n_ut_tba_j is less than or equal to + // the number of remaining utiles in the column, we can finish allocating + // without moving to the next column. But if n_ut_tba_j exceeds n_ut_rem_j, + // then we aren't done yet, so allocate what we can and move on. + if ( n_ut_tba_j <= n_ut_rem_j ) + { + n_ut_for_me += n_ut_tba_j; + uops_tba -= n_ut_tba_j * k_iter_j; + i = i_st_cur + n_ut_tba_j; + + j_en_cur = j; i_en_cur = i - 1; done_e = TRUE; + } + else // if ( n_ut_rem_j < n_ut_tba_j ) + { + n_ut_for_me += n_ut_rem_j; + uops_tba -= n_ut_rem_j * k_iter_j; + i = i_st_cur + n_ut_rem_j; + } + + PGUARD printf( "tid_i: %ld i: %2ld (* n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i-1, n_ut_for_me, uops_ta - uops_tba ); + + // If we allocated all utiles in the column (regardless of whether we finished + // allocating utiles for the current thread), increment j to the next column, + // which is where we'll continue our search for the current tid's end point + // (or start our search through the next tid's range if we finished allocating + // the current tid's utiles). Additionally, if we allocated utiles from the + // last column, mark the tid's end point and set the "done early" flag. + if ( i == m_iter ) + { + j += 1; i = 0; + if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; done_e = TRUE; } + + PGUARD printf( "tid_i: %ld j: %2ld (! 
n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + + #endif + } + + // This code advances over as many columns of utiles as possible, within + // the rectangular region (i.e., pre-diagonal), and then walks down to + // the correct utile within the subsequent column. However, note that + // this code gets skipped entirely if the previous code block was able + // to allocate all of the current tid's uops. + if ( !done_e ) + { + // If j is positioned somewhere within the rectangular region, we can + // skip over as many utile columns as possible with some integer math. + // And depending on how many uops we were able to allocate relative to + // the number of columns that exist, we may need to walk through the + // triangular region as well. But if j is already in the triangular + // region, we set a flag so that we execute the code that will walk + // through those columns. + if ( j < diagoff_iter ) + { + const dim_t j_inc0 = uops_tba / uops_per_col_rect; + const dim_t j_left0 = uops_tba % uops_per_col_rect; + + // We need to set a hard limit on how much j_inc can be. Namely, + // it should not exceed the number of utile columns that are left + // in the rectangular region of the matrix, nor should it exceed + // the total number of utile columns that are left. + const dim_t j_inc1 = bli_min( j_inc0, diagoff_iter - j ); + const dim_t j_inc = bli_min( j_inc1, n_iter - j ); + const dim_t delta = j_inc0 - j_inc; + const dim_t j_left = j_left0 + delta * uops_per_col_rect; + + // Increment j by the number of full utile columns we allocate, and + // set the remaining utile ops to be allocated to the remainder. 
+ j += j_inc; + uops_tba = j_left; + + n_ut_for_me += j_inc * m_iter; + + PGUARD printf( "tid_i: %ld advanced to col: %2ld (uops traversed: %ld)\n", + tid_i, j, uops_per_col_rect * j_inc ); + PGUARD printf( "tid_i: %ld j: %2ld (1 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + PGUARD printf( "tid_i: %ld uops left to alloc: %2ld \n", tid_i, j_left ); + + if ( uops_tba == 0 ) + { + // If advancing j_inc columns allocated all of our uops, then + // designate the last iteration of the previous column as the + // end point. + j_en_cur = j - 1; + i_en_cur = m_iter - 1; + search_tri = FALSE; + + PGUARD printf( "tid_i: %ld j: %2ld (2 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else if ( j > n_iter ) bli_abort(); // Safety check; should never execute. + else if ( j == n_iter ) + { + // If we still have at least some uops to allocate, and advancing + // j_inc columns landed us at the beginning of the first non- + // existent column (column n_iter), then we're done. (The fact + // that we didn't get to allocate all of our uops just means that + // the lower tids slightly overshot their allocations, leaving + // fewer uops for the last thread.) + search_tri = FALSE; + PGUARD printf( "tid_i: %ld j: %2ld (3 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else if ( j < diagoff_iter ) + { + // If we still have at least some uops to allocate, and advancing + // j_inc columns landed us at the beginning of a column that is + // still in the rectangular region, then we don't need to enter + // the triangular region (if it even exists). The code below will + // walk down the current column and find the utile that puts us + // over the top. 
+ search_tri = FALSE; + PGUARD printf( "tid_i: %ld j: %2ld (4 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else // if ( 0 < uops_tba && j == diagoff_iter && j < n_iter ) + { + // If we have at least some uops to allocate, and we still have + // at least some columns to process, then we set a flag to + // indicate that we still need to step through the triangular + // region. + search_tri = TRUE; + PGUARD printf( "tid_i: %ld j: %2ld (5 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + } + else /* if ( diagoff_iter <= j ) */ + { + PGUARD printf( "tid_i: %ld j: %2ld >= diagoff_iter: %ld\n", + tid_i, j, diagoff_iter ); + search_tri = TRUE; + } + + PGUARD printf( "tid_i: %ld j: %2ld search_tri: %d\n", tid_i, j, search_tri ); + + if ( search_tri ) + { + // If we still have some uops to allocate in the triangular region, + // we first allocate as many full utile columns as possible without + // exceeding the number of uops left to be allocated. + for ( ; j < n_iter; ++j ) + { + const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j ); + const dim_t n_uops_j = k_iter_j * m_iter; + + PGUARD printf( "tid_i: %ld j: %2ld (6 n_ut_cur: %ld) (uops_alloc: %ld) (n_uops_j: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba, n_uops_j ); + + if ( uops_tba == 0 ) + { + PGUARD printf( "tid_i: %ld j: %2ld (7 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + // If advancing over the previous column allocated all of + // our uops, then designate the last iteration of the + // previous column as the end point. + j_en_cur = j - 1; + i_en_cur = m_iter - 1; + break; + } + if ( n_uops_j <= uops_tba ) + { + // If advancing over the current column doesn't exceed the + // number of uops left to allocate, then allocate them. (If + // n_uops_j == uops_tba, then we'll be done shortly after + // incrementing j.) 
+ n_ut_for_me += m_iter; + uops_tba -= n_uops_j; + + PGUARD printf( "tid_i: %ld j: %2ld (8 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else // if ( uops_tba < n_uops_j ) + { + PGUARD printf( "tid_i: %ld j: %2ld (9 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + // If we can finish allocating all the remaining uops + // with the utiles in the current column, then we break + // out of the loop without updating j, n_ut_for_me, or + // uops_tba. The remaining uops will be allocated in + // the loop over m_iter below. + break; + } + } + } + + // If there are any uops left to allocate, and we haven't already + // exhausted all allocatable utiles, it means that we have to walk down + // the current column and find the utile that puts us over the top. + if ( 0 < uops_tba && j < n_iter ) + { + const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j ); + + PGUARD printf( "tid_i: %ld j: %2ld (A n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba, k_iter_j ); + + #if 0 + + dim_t i; + for ( i = 0; i < m_iter; ++i ) + { + n_ut_for_me += 1; + const dim_t uops_tba_new = uops_tba - k_iter_j; + uops_tba = uops_tba_new; + PGUARD printf( "tid_i: %ld i: %2ld (B n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; break; } + } + + if ( i == m_iter ) + { + j += 1; + if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; } + } + + #else + + const dim_t n_ut_j = uops_tba / k_iter_j + ( uops_tba % k_iter_j ? 
1 : 0 ); + const dim_t i = n_ut_j - 1; + + uops_tba -= n_ut_j * k_iter_j; + n_ut_for_me += n_ut_j; + + j_en_cur = j; i_en_cur = i; + + PGUARD printf( "tid_i: %ld i: %2ld (b n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + + #endif + } + else // if ( uops_tba <= 0 || j == n_iter ) + { + j_en_cur = j - 1; + i_en_cur = m_iter - 1; + } + } + + PGUARD printf( "tid_i: %ld done! (C n_ut_cur: %ld) (overshoot: %ld out of %ld)\n", + tid_i, n_ut_for_me, -uops_tba, uops_ta ); + + if ( tid_i == tid ) + { + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + *j_en_p = j_en_cur; + *i_en_p = i_en_cur; + return n_ut_for_me; + } + + // Use the current tid's ending i,j values to determine the starting i,j + // values for the next tid. + j_st_cur = j_en_cur; + i_st_cur = i_en_cur + 1; + if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; } + + PGUARD printf( "tid_i: %ld (D n_ut_cur: %ld)\n", + tid_i, n_ut_for_me ); + PGUARD printf( "tid_i: %ld tid %ld will start at j,i: %ld %ld\n", + tid_i, tid_i + 1, j_st_cur, i_st_cur ); + PGUARD printf( "---------------------------\n" ); + } + +#ifndef PRINT_MODE + + // + // -- Step 4: Handle the last thread's allocation -------------------------- + // + + // An optimization: The above loop runs to nt - 1 rather than nt since it's + // easy to count the number of utiles allocated to the last thread. + const dim_t n_ut_for_me = m_iter - i_st_cur + + (n_iter - j_st_cur - 1) * m_iter; + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + *j_en_p = n_iter - 1; + *i_en_p = m_iter - 1; + + PGUARD printf( "tid_i: %ld (E n_ut_for_me: %ld) (j,i_st: %ld %ld)\n", + tid, n_ut_for_me, j_st_cur, i_st_cur ); + + return n_ut_for_me; +#else + // This line should never execute, but we need it to satisfy the compiler. 
+ return -1; +#endif +} + diff --git a/frame/thread/bli_thread_range_tlb.h b/frame/thread/bli_thread_range_tlb.h new file mode 100644 index 000000000..b344f09ef --- /dev/null +++ b/frame/thread/bli_thread_range_tlb.h @@ -0,0 +1,192 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_THREAD_RANGE_TLB_H +#define BLIS_THREAD_RANGE_TLB_H + +#if 0 +dim_t bli_thread_range_tlb + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +#endif +dim_t bli_thread_range_tlb_l + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_u + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_d + ( + const dim_t nt, + const dim_t tid, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); + +// --- + +dim_t bli_thread_range_tlb_trmm_ll + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_lu + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_lx_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +#if 0 +dim_t bli_thread_range_tlb_trmm_r + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +#endif + +// --- + +dim_t 
bli_thread_range_tlb_trmm_rl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_ru + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_rl_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p, + inc_t* j_en_p, + inc_t* i_en_p + ); + +#endif diff --git a/frame/thread/old/bli_thread_range_snake.c b/frame/thread/old/bli_thread_range_snake.c new file mode 100644 index 000000000..11a287659 --- /dev/null +++ b/frame/thread/old/bli_thread_range_snake.c @@ -0,0 +1,120 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#if 0 +void bli_thread_range_snake_jr + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use snake partitioning of jr loop. + + // NOTE: This function currently assumes that edge cases are handled + // "high" and therefore ignores handle_edge_low. This is because the + // function is only used by gemmt and friends (herk/her2k/syrk/syr2k). + // These operations, unlike trmm/trmm3 and trsm, never require + // low-range edge cases. + + const dim_t tid = bli_thrinfo_work_id( thread ); + const dim_t nt = bli_thrinfo_n_way( thread ); + + const dim_t n_left = n % bf; + const dim_t n_iter = n / bf + ( n_left ? 1 : 0 ); + + if ( bli_is_lower( uplo ) ) + { + // Use the thrinfo_t work id as the thread's starting index. + const dim_t st = tid; + + // This increment will be too big for some threads with only one unit + // (NR columns, or an edge case) of work, but that's okay since all that + // matters is that st + in >= en, which will cause that thread's jr loop + // to not execute beyond the first iteration. 
+ const dim_t in = 2 * ( nt - tid ) - 1; + + dim_t en = st + in + 1; + + // Don't let the thread's end index exceed n_iter. + if ( n_iter < en ) en = n_iter; + + *start = st * bf; + *end = en * bf; // - ( bf - n_left ); + *inc = in * bf; + } + else // if ( bli_is_upper( uplo ) ) + { + dim_t st = n_iter - 2 * nt + tid; + + const dim_t in = 2 * ( nt - tid ) - 1; + + dim_t en = st + in + 1; + + #if 1 + // When nt exceeds half n_iter, some threads will only get one unit + // (NR columns, or an edge case) of work. This manifests as st being + // negative, and thus we need to move their start index to their other + // assigned unit in the positive index range. + if ( st < 0 ) st += in; + + // If the start index is *still* negative, which happens for some + // threads when nt exceeds n_iter, then manually assign this thread + // an empty index range. + if ( st < 0 ) { st = 0; en = 0; } + #else + if ( 0 <= st + in ) { st += in; } + else { st = 0; en = 0; } + #endif + + #if 0 + printf( "thread_range_snake_jr(): tid %d: sta end = %3d %3d %3d\n", + (int)tid, (int)(st), (int)(en), (int)(in) ); + #endif + + *start = st * bf; + *end = en * bf; + *inc = in * bf; + } +} +#endif diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/thread/old/bli_thread_range_snake.h similarity index 70% rename from frame/1m/packm/bli_packm_thrinfo.h rename to frame/thread/old/bli_thread_range_snake.h index 1ac7f88df..73fd4ae73 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/thread/old/bli_thread_range_snake.h @@ -32,34 +32,22 @@ */ -// -// thrinfo_t macros specific to packm. 
-// - -/* -#define bli_packm_thread_my_iter( index, thread ) \ -\ - ( index % thread->n_way == thread->work_id % thread->n_way ) -*/ - -#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ -\ - ( i % n_way == work_id % n_way ) - -#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ -\ - ( start <= i && i < end ) - -// Define a general-purpose version of bli_packm_my_iter() whose definition -// depends on whether slab or round-robin partitioning was requested at -// configure-time. -#ifdef BLIS_ENABLE_JRIR_SLAB - - #define bli_packm_my_iter bli_packm_my_iter_sl - -#else // BLIS_ENABLE_JRIR_RR - - #define bli_packm_my_iter bli_packm_my_iter_rr - +#ifndef BLIS_THREAD_RANGE_SNAKE_H +#define BLIS_THREAD_RANGE_SNAKE_H + +#if 0 +void bli_thread_range_snake_jr + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ); #endif +#endif diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 02f7458ad..b61140743 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -344,11 +344,11 @@ void PASTECH2(bls_,ch,varname) \ \ /* Compute the addresses of the next micropanels of A and B. 
*/ \ a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ - if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_ic_use; \ b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ - if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_pc_use; \ } \ \ diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c index 7c2c4e9a9..b37d34cce 100644 --- a/sandbox/gemmlike/bls_l3_packm_var1.c +++ b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -131,10 +131,10 @@ void PASTECH2(bls_,ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( ic = ic0, it = 0; it < n_iter; \ @@ -147,10 +147,10 @@ void PASTECH2(bls_,ch,varname) \ ctype* restrict c_use = c_begin; \ ctype* restrict p_use = p_begin; \ \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. (The default is slab.) 
*/ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ PASTECH2(bls_,ch,packm_cxk) \ ( \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index 94ee0efcd..b3efbbc28 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -131,10 +131,10 @@ void PASTECH2(bls_,ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( ic = ic0, it = 0; it < n_iter; \ @@ -147,10 +147,10 @@ void PASTECH2(bls_,ch,varname) \ ctype* restrict c_use = c_begin; \ ctype* restrict p_use = p_begin; \ \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. (The default is slab.) */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ /* NOTE: We assume here that kappa = 1 and therefore ignore it. If we're wrong, this will get someone's attention. 
*/ \ diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 851102a2f..8656652b3 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -786,7 +786,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) char impl_str[32]; char def_impl_set_str[32]; char def_impl_unset_str[32]; - char jrir_str[16]; + char jrir_str[32]; const bool has_openmp = bli_info_get_enable_openmp(); const bool has_pthreads = bli_info_get_enable_pthreads(); @@ -821,8 +821,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) else sprintf( def_impl_set_str, "single" ); // Describe the status of jrir thread partitioning. - if ( bli_info_get_thread_part_jrir_slab() ) sprintf( jrir_str, "slab" ); - else /*bli_info_get_thread_part_jrir_rr()*/ sprintf( jrir_str, "round-robin" ); + if ( bli_info_get_thread_jrir_slab() ) sprintf( jrir_str, "slab" ); + else if ( bli_info_get_thread_jrir_rr() ) sprintf( jrir_str, "round-robin" ); + else /*bli_info_get_thread_jrir_tlb()*/ sprintf( jrir_str, "tile-level (slab)" ); char nt_str[16]; char jc_nt_str[16]; diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index 0504b3315..497ecf97e 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -271,7 +271,10 @@ void libblis_test_trmm_impl switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: +//bli_printm( "a", a, "%5.2f", "" ); +//bli_printm( "b", b, "%5.2f", "" ); bli_trmm( side, alpha, a, b ); +//bli_printm( "b after", b, "%5.2f", "" ); break; default: From d220f9c436c0dae409974724d42ab6c52f12a726 Mon Sep 17 00:00:00 2001 From: Nisanth M P Date: Wed, 11 Jan 2023 08:43:03 +0530 Subject: [PATCH 118/230] Fix k = 0 edge case in power10 microkernels (#706) Details: - When power10 sgemm and dgemm microkernels are called with k = 0, they become caught in infinite loops and segfault. This is fixed now via an early exit in the case of k = 0. 
--- kernels/power10/3/bli_dgemm_power10_mma.c | 29 ++++++++++------------- kernels/power10/3/bli_sgemm_power10_mma.c | 29 ++++++++++------------- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c index abf66f58f..67163b5a7 100644 --- a/kernels/power10/3/bli_dgemm_power10_mma.c +++ b/kernels/power10/3/bli_dgemm_power10_mma.c @@ -74,12 +74,10 @@ void bli_dgemm_power10_mma_8x8 cntx_t* cntx ) { - // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. - // (1 is subtracted from k0 because 1 iteration of the k loop is pulled out) - uint64_t k_iter = (k-1) / 4; - uint64_t k_left = (k-1) % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; @@ -110,6 +108,16 @@ void bli_dgemm_power10_mma_8x8 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + // initialize the accumulators to zeros + __builtin_mma_xxsetaccz(&acc0); + __builtin_mma_xxsetaccz(&acc1); + __builtin_mma_xxsetaccz(&acc2); + __builtin_mma_xxsetaccz(&acc3); + __builtin_mma_xxsetaccz(&acc4); + __builtin_mma_xxsetaccz(&acc5); + __builtin_mma_xxsetaccz(&acc6); + __builtin_mma_xxsetaccz(&acc7); + /* 2 vector pairs are necessary for a double precision outer product instruction. 
*/ __vector_pair colA_1, @@ -141,19 +149,6 @@ void bli_dgemm_power10_mma_8x8 */ D_ASSEMBLE_VEC_PAIR - /* Compute accumulate outer products and override accumulators with result */ - __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]); - __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]); - __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]); - __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]); - __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]); - __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]); - __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]); - __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]); - - /* Move A and B pointers */ - D_INCREMENT - // k loop (unrolled by 4) for (int k = 0; k Date: Wed, 11 Jan 2023 08:50:57 +0530 Subject: [PATCH 119/230] Disable power10 kernels other than sgemm, dgemm. (#705) Details: - There is a power10 sandbox which uses microkernels for datatypes other than float and double (or scomplex/dcomplex). In a regular power10- configured build (that is, with the sandbox disabled), there were compile errors for some of these other non-sgemm/non-dgemm microkernels. This commit protects those kernels with a new cpp macro guard (which is defined in sandbox/power10/bli_sandbox.h) that prevents that kernel code from being compiled for normal, non-sandbox power10 builds. 
--- kernels/power10/3/bli_i16gemm_power10_mma.c | 5 ++++- kernels/power10/3/bli_i16sgemm_power10_mma.c | 5 ++++- kernels/power10/3/bli_i4gemm_power10_mma.c | 3 +++ kernels/power10/3/bli_i8gemm_power10_mma.c | 3 +++ kernels/power10/3/bli_sbgemm_power10_mma.c | 5 ++++- kernels/power10/3/bli_shgemm_power10_mma.c | 5 ++++- sandbox/power10/bli_sandbox.h | 4 ++++ 7 files changed, 26 insertions(+), 4 deletions(-) diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c index d0c9390f5..cc1cd3d84 100644 --- a/kernels/power10/3/bli_i16gemm_power10_mma.c +++ b/kernels/power10/3/bli_i16gemm_power10_mma.c @@ -32,6 +32,8 @@ */ +#ifdef BLIS_SANDBOX_POWER10 + #include "vector_int_macros.h" #define I16_ACCUMULATE \ @@ -139,4 +141,5 @@ void bli_i16gemm_power10_mma_8x16 SAVE_ACC_bz(iv4sf_t, &acc6, rs_c, 8+4*rs_c); SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c); } -} \ No newline at end of file +} +#endif // BLIS_SANDBOX_POWER10 diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c index 7d84e68e2..26da6cf79 100644 --- a/kernels/power10/3/bli_i16sgemm_power10_mma.c +++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c @@ -32,6 +32,8 @@ */ +#ifdef BLIS_SANDBOX_POWER10 + #include "vector_int_macros.h" #define I16S_ACCUMULATE \ @@ -139,4 +141,5 @@ void bli_i16sgemm_power10_mma_8x16 SAVE_ACC_bz(iv4sf_t, &acc6, rs_c, 8+4*rs_c); SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c); } -} \ No newline at end of file +} +#endif // BLIS_SANDBOX_POWER10 diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c index 6c78a9f00..a8d25d2da 100644 --- a/kernels/power10/3/bli_i4gemm_power10_mma.c +++ b/kernels/power10/3/bli_i4gemm_power10_mma.c @@ -32,6 +32,8 @@ */ +#ifdef BLIS_SANDBOX_POWER10 + #include "vector_int_macros.h" #define I4_ACCUMULATE \ @@ -140,3 +142,4 @@ void bli_i4gemm_power10_mma_8x16 SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c); } } +#endif // 
BLIS_SANDBOX_POWER10 diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c index 8a0b158a5..2948e10bf 100644 --- a/kernels/power10/3/bli_i8gemm_power10_mma.c +++ b/kernels/power10/3/bli_i8gemm_power10_mma.c @@ -32,6 +32,8 @@ */ +#ifdef BLIS_SANDBOX_POWER10 + #include "vector_int_macros.h" #define I8_ACCUMULATE \ @@ -139,3 +141,4 @@ void bli_i8gemm_power10_mma_8x16 SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c); } } +#endif // BLIS_SANDBOX_POWER10 diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c index c16710f45..e68c5bed9 100644 --- a/kernels/power10/3/bli_sbgemm_power10_mma.c +++ b/kernels/power10/3/bli_sbgemm_power10_mma.c @@ -32,6 +32,8 @@ */ +#ifdef BLIS_SANDBOX_POWER10 + #include "vector_int_macros.h" #define B_ACCUMULATE \ @@ -140,4 +142,5 @@ void bli_sbgemm_power10_mma_8x16 SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c); } -} \ No newline at end of file +} +#endif // BLIS_SANDBOX_POWER10 diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c index dc62b5d60..9c7f9f741 100644 --- a/kernels/power10/3/bli_shgemm_power10_mma.c +++ b/kernels/power10/3/bli_shgemm_power10_mma.c @@ -32,6 +32,8 @@ */ +#ifdef BLIS_SANDBOX_POWER10 + #include "vector_int_macros.h" #define H_ACCUMULATE \ @@ -140,4 +142,5 @@ void bli_shgemm_power10_mma_8x16 SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c); } -} \ No newline at end of file +} +#endif // BLIS_SANDBOX_POWER10 diff --git a/sandbox/power10/bli_sandbox.h b/sandbox/power10/bli_sandbox.h index 22d293d13..35f786912 100644 --- a/sandbox/power10/bli_sandbox.h +++ b/sandbox/power10/bli_sandbox.h @@ -35,6 +35,10 @@ #ifndef BLIS_SANDBOX_H #define BLIS_SANDBOX_H +#ifndef BLIS_SANDBOX_POWER10 +#define BLIS_SANDBOX_POWER10 +#endif + #include "blis.h" #include "gemm_prototypes.h" From 38d88d5c131253066cad4f98eea06fa9299cae3b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 10 Jan 2023 21:24:58 
-0600 Subject: [PATCH 120/230] Define new global scalar (obj_t) constants. (#703) Details: - This commit defines the following new global scalar constants: - BLIS_ONE_I: This constant encodes the imaginary unit. - BLIS_MINUS_ONE_I: This constant encodes the negative imaginary unit. - BLIS_NAN: This constant encodes a not-a-number value. Both real and imaginary parts are set to NaN for complex datatypes. --- frame/base/bli_const.c | 26 ++++++++++++++++---------- frame/include/bli_extern_defs.h | 3 +++ frame/include/bli_type_defs.h | 9 +++++++++ 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/frame/base/bli_const.c b/frame/base/bli_const.c index 210d6ae77..03f1f7e60 100644 --- a/frame/base/bli_const.c +++ b/frame/base/bli_const.c @@ -36,19 +36,25 @@ // Statically initialize structs containing representations of various // constants for each datatype supported in BLIS. -static constdata_t bli_two_buffer = bli_obj_init_constdata( 2.0 ); -static constdata_t bli_one_buffer = bli_obj_init_constdata( 1.0 ); -static constdata_t bli_zero_buffer = bli_obj_init_constdata( 0.0 ); -static constdata_t bli_mone_buffer = bli_obj_init_constdata( -1.0 ); -static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 ); +static constdata_t bli_two_buffer = bli_obj_init_constdata( 2.0 ); +static constdata_t bli_one_buffer = bli_obj_init_constdata( 1.0 ); +static constdata_t bli_zero_buffer = bli_obj_init_constdata( 0.0 ); +static constdata_t bli_mone_buffer = bli_obj_init_constdata( -1.0 ); +static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 ); +static constdata_t bli_onei_buffer = bli_obj_init_constdata_ri( 0.0, 1.0 ); +static constdata_t bli_monei_buffer = bli_obj_init_constdata_ri( 0.0, -1.0 ); +static constdata_t bli_nan_buffer = bli_obj_init_constdata_ri( NAN, NAN ); // Statically initialize global scalar constants, attaching the addresses // of the corresponding structs above. 
-const obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer ); -const obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer ); -const obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer ); -const obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer ); -const obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer ); +const obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer ); +const obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer ); +const obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer ); +const obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer ); +const obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer ); +const obj_t BLIS_ONE_I = bli_obj_init_const( &bli_onei_buffer ); +const obj_t BLIS_MINUS_ONE_I = bli_obj_init_const( &bli_monei_buffer ); +const obj_t BLIS_NAN = bli_obj_init_const( &bli_nan_buffer ); #if 0 obj_t BLIS_TWO = {}; diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 71a6096e1..f157a6d56 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -42,6 +42,9 @@ BLIS_EXPORT_BLIS extern const obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO; +BLIS_EXPORT_BLIS extern const obj_t BLIS_ONE_I; +BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE_I; +BLIS_EXPORT_BLIS extern const obj_t BLIS_NAN; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 014be18b7..cb933bfa4 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1415,6 +1415,15 @@ BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b ) .i = ( gint_t )val, \ } +#define bli_obj_init_constdata_ri( valr, vali ) \ +{ \ + .s = ( float )valr, \ + .d = ( double )valr, \ + .c = { .real = ( float )valr, .imag = ( float )vali }, \ + .z = { .real = ( double 
)valr, .imag = ( double )vali }, \ + .i = ( gint_t )valr, \ +} + // -- Context type -- From b895ec9f1f66fb93972589c06bff171337153a31 Mon Sep 17 00:00:00 2001 From: Nisanth M P Date: Wed, 11 Jan 2023 09:02:32 +0530 Subject: [PATCH 121/230] Fixing type-mismatch errors in power10 sandbox (#701) Details: - This commit fixes a mismatch between the function type signature of bli_gemm_ex() required by BLIS and the version of the function defined within the power10 sandbox. It also performs typecasting upon calling bli_gemm_front() to attain type consistency with the type signature defined by BLIS for bli_gemm_front(). --- sandbox/power10/bli_gemm_ex.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sandbox/power10/bli_gemm_ex.c b/sandbox/power10/bli_gemm_ex.c index d136c7e1b..7eef0ccef 100644 --- a/sandbox/power10/bli_gemm_ex.c +++ b/sandbox/power10/bli_gemm_ex.c @@ -46,13 +46,13 @@ void bli_gemm_ex ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm ) { bli_init_once(); @@ -73,7 +73,7 @@ void bli_gemm_ex // Invoke the operation's front end. bli_gemm_front ( - alpha, a, b, beta, c, cntx, rntm + alpha, a, b, beta, c, cntx, (rntm_t* )rntm ); } From 9a366b14fe52c469f4664ef5dd93d85be8d97baa Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 12 Jan 2023 13:07:22 -0600 Subject: [PATCH 122/230] Implement cntx_t pointer caching in gks. (#709) Details: - Refactored the gks cntx_t query functions so that: (1) there is a clearer pattern of similarity between functions that query a native context and those that query its induced (1m) counterpart; and (2) queried cntx_t pointers (for both native and induced cntx_t pointers) are cached (by default), or deep-queried upon each invocation, depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is defined. 
- Refactored query-related functions in bli_arch.c to cache the queried arch_t value (by default), or deep-query the arch_t value upon each invocation, depending on whether cpp macro BLIS_ENABLE_GKS_CACHING is defined. - Tweaked the behavior of bli_gks_query_ind_cntx_impl() (formerly named bli_gks_query_ind_cntx()) so that the induced method cntx_t struct is repopulated each time the function is called. (It is still only allocated once on first call.) This was mostly done in preparation for some future in which the arch_t value might change at runtime. In such a scenario, the induced method context would need to be recalculated any time the native context changes. - Added preprocessor logic to bli_config_macro_defs.h to handle enabling or disabling of cntx_t pointer caching (via BLIS_ENABLE_GKS_CACHING). - For now, cntx_t pointer caching is enabled by default and does not correspond to any official configure option. Disabling can be done by inserting a #define for BLIS_DISABLE_GKS_CACHING into the appropriate bli_family_*.h header file within the configuration of interest. - Thanks to Harihara Sudhan S (AMD) for suggesting that cntx_t pointers (and not just arch_t values) be cached. - Comment updates. --- CREDITS | 1 + frame/base/bli_arch.c | 43 +++++++- frame/base/bli_arch.h | 5 +- frame/base/bli_gks.c | 146 ++++++++++++++++++++------ frame/base/bli_gks.h | 7 +- frame/base/bli_ind.c | 6 +- frame/base/bli_memsys.c | 10 +- frame/include/bli_config_macro_defs.h | 11 ++ frame/thread/bli_pthread.c | 2 +- 9 files changed, 179 insertions(+), 52 deletions(-) diff --git a/CREDITS b/CREDITS index 939351c00..51afcc276 100644 --- a/CREDITS +++ b/CREDITS @@ -104,6 +104,7 @@ but many others have contributed code and feedback, including Paul Springer @springer13 (RWTH Aachen University) Adam J.
Stewart @adamjstewart (University of Illinois at Urbana-Champaign) Vladimir Sukarev + Harihara Sudhan S @ihariharasudhan (AMD) Chengguo Sun @chengguosun Santanu Thangaraj (AMD) Nicholai Tukanov @nicholaiTukanov (The University of Texas at Austin) diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 48b50a774..bd3f24993 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -67,14 +67,27 @@ // The arch_t id for the currently running hardware. We initialize to -1, // which will be overwritten upon calling bli_arch_set_id(). -static arch_t id = -1; +static arch_t cached_id = -1; arch_t bli_arch_query_id( void ) { + +#ifdef BLIS_ENABLE_GKS_CACHING + + // Deep-query the arch_t id once via bli_pthread_once(). Since we are + // constrained by the pthread interface to pthread_once(), the id must be + // "returned" indirectly via a static variable (cached_id). bli_arch_set_id_once(); - // Simply return the id that was previously cached. - return id; + // Return the id that was previously cached. + return cached_id; + +#else + + // Deep-query and return a fresh arch_t. + return bli_arch_query_id_impl(); + +#endif } // ----------------------------------------------------------------------------- @@ -85,6 +98,9 @@ static bli_pthread_once_t once_id = BLIS_PTHREAD_ONCE_INIT; void bli_arch_set_id_once( void ) { + // When this file is being compiled as part of the configure script's + // hardware auto-detection driver, we avoid calling the bli_pthread APIs + // so that we aren't required to include those symbols in the executable. #ifndef BLIS_CONFIGURETIME_CPUID bli_pthread_once( &once_id, bli_arch_set_id ); #endif @@ -94,6 +110,16 @@ void bli_arch_set_id_once( void ) void bli_arch_set_id( void ) { + // Deep-query the arch_t and save it in the static variable (cached_id). 
+ cached_id = bli_arch_query_id_impl(); +} + +// ----------------------------------------------------------------------------- + +arch_t bli_arch_query_id_impl( void ) +{ + arch_t id; + // Check the environment variable BLIS_ARCH_DEBUG to see if the user // requested that we echo the result of the subconfiguration selection. bool do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 ); @@ -103,6 +129,9 @@ void bli_arch_set_id( void ) // requested that we use a specific subconfiguration. dim_t req_id = bli_env_get_var( "BLIS_ARCH_TYPE", -1 ); + // When this file is being compiled as part of the configure script's + // hardware auto-detection driver, we avoid calling the bli_check APIs + // so that we aren't required to include those symbols in the executable. #ifndef BLIS_CONFIGURETIME_CPUID if ( req_id != -1 ) { @@ -243,8 +272,12 @@ void bli_arch_set_id( void ) fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n", bli_arch_string( id ) ); - //printf( "blis_arch_query_id(): id = %u\n", id ); - //exit(1); + #if 0 + printf( "blis_arch_query_id_impl(): id = %u\n", id ); + exit(1); + #endif + + return id; } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h index 08af7ae79..a5f0c15d3 100644 --- a/frame/base/bli_arch.h +++ b/frame/base/bli_arch.h @@ -37,8 +37,9 @@ BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); -void bli_arch_set_id_once( void ); -void bli_arch_set_id( void ); +void bli_arch_set_id_once( void ); +void bli_arch_set_id( void ); +arch_t bli_arch_query_id_impl( void ); BLIS_EXPORT_BLIS const char* bli_arch_string( arch_t id ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 094810d9d..df0abc8ed 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -35,8 +35,8 @@ #include "blis.h" -// The array of cntx_t* pointers to cache modified contexts used by -// induced methods. 
+// The array of cntx_t* pointers to cache modified contexts used by induced +// methods. static cntx_t** gks[ BLIS_NUM_ARCHS ]; // The array of function pointers holding the registered context initialization @@ -52,6 +52,13 @@ typedef void (*nat_cntx_init_ft)( cntx_t* cntx ); typedef void (*ref_cntx_init_ft)( cntx_t* cntx ); typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx ); +// Cached copies of the pointers to the native and induced contexts for the +// active subconfiguration. When BLIS_ENABLE_GKS_CACHING is enabled, these +// pointers will be set once and then reused to fulfill subsequent context +// queries. +static cntx_t* cached_cntx_nat = NULL; +static cntx_t* cached_cntx_ind = NULL; + // ----------------------------------------------------------------------------- void bli_gks_init( void ) @@ -216,6 +223,18 @@ void bli_gks_init( void ) bli_cntx_init_generic_ind ); #endif } + +#ifdef BLIS_ENABLE_GKS_CACHING + // Deep-query and cache the native and induced method contexts so they are + // ready to go when needed (by BLIS or the application). Notice that we use + // the _noinit() APIs, which skip their internal calls to bli_init_once(). + // The reasons: (1) Skipping that call is necessary to prevent an infinite + // loop since the current function, bli_gks_init(), is called from within + // bli_init_once(); and (2) we can guarantee that the gks has been + // initialized given that bli_gks_init() is about to return. + cached_cntx_nat = ( cntx_t* )bli_gks_query_nat_cntx_noinit(); + cached_cntx_ind = ( cntx_t* )bli_gks_query_ind_cntx_noinit( BLIS_1M ); +#endif } // ----------------------------------------------------------------------------- @@ -267,6 +286,12 @@ void bli_gks_finalize( void ) } // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_GKS_CACHING + // Clear the cached pointers to the native and induced contexts. 
+ cached_cntx_nat = NULL; + cached_cntx_ind = NULL; +#endif } // ----------------------------------------------------------------------------- @@ -475,10 +500,38 @@ const cntx_t* bli_gks_query_cntx( void ) return bli_gks_query_nat_cntx(); } +// ----------------------------------------------------------------------------- + const cntx_t* bli_gks_query_nat_cntx( void ) { bli_init_once(); +#ifdef BLIS_ENABLE_GKS_CACHING + + // Return a pointer to the context for native execution that was deep- + // queried and cached at the end of bli_gks_init(). + return cached_cntx_nat; + +#else + + // Deep-query and return the address of a context for native execution. + return bli_gks_query_nat_cntx_impl(); + +#endif +} + +const cntx_t* bli_gks_query_nat_cntx_noinit( void ) +{ + // NOTE: This function purposefully avoids calling bli_init_once() so that + // it is safe to call during inititalization. + + return bli_gks_query_nat_cntx_impl(); +} + +// ----------------------------------------------------------------------------- + +const cntx_t* bli_gks_query_nat_cntx_impl( void ) +{ // Return the address of the native context for the architecture id // corresponding to the current hardware, as determined by // bli_arch_query_id(). @@ -494,18 +547,42 @@ const cntx_t* bli_gks_query_nat_cntx( void ) // ----------------------------------------------------------------------------- -const cntx_t* bli_gks_query_cntx_noinit( void ) +const cntx_t* bli_gks_query_ind_cntx + ( + ind_t ind + ) { - // This function is identical to bli_gks_query_cntx(), except that it - // does not call bli_init_once(). + bli_init_once(); - // Query the architecture id. - arch_t id = bli_arch_query_id(); +#ifdef BLIS_ENABLE_GKS_CACHING - // Use the architecture id to look up a pointer to its context. - const cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); + // If for some reason the native context was requested, we return its + // address instead of the one for induced execution. 
+ if ( ind == BLIS_NAT ) return cached_cntx_nat; - return cntx; + // Return a pointer to the context for the induced method that was deep- + // queried and cached at the end of bli_gks_init(). + return cached_cntx_ind; + +#else + + // Deep-query and return the address of a context for the requested induced + // method. (In this case, caching never takes place since it was disabled + // at configure-time.) + return bli_gks_query_ind_cntx_impl( ind ); + +#endif +} + +const cntx_t* bli_gks_query_ind_cntx_noinit + ( + ind_t ind + ) +{ + // NOTE: This function purposefully avoids calling bli_init_once() so that + // it is safe to call during inititalization. + + return bli_gks_query_ind_cntx_impl( ind ); } // ----------------------------------------------------------------------------- @@ -514,16 +591,15 @@ const cntx_t* bli_gks_query_cntx_noinit( void ) // with a new entry corresponding to a context for an ind_t value. static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; -const cntx_t* bli_gks_query_ind_cntx +const cntx_t* bli_gks_query_ind_cntx_impl ( ind_t ind ) { - bli_init_once(); - cntx_t* gks_id_ind; err_t r_val; + // Return the address of a context that will be suited for executing a // level-3 operation via the requested induced method (and datatype) for // the architecture id corresponding to the current hardware, as @@ -532,10 +608,13 @@ const cntx_t* bli_gks_query_ind_cntx // This function is called when a level-3 operation via induced method is // called, e.g. bli_gemm1m(). If this is the first time that induced method // is being executed since bli_gks_init(), the necessary context structure - // is allocated and initialized. If this is not the first time, then the - // address of a previously-allocated and initialized (cached) context is - // returned. Note that much of this must be done with mutual exclusion to - // ensure thread safety and deterministic behavior. + // is allocated. 
If this is not the first time a context for the requested + // induced method was queried, then the memory will already be allocated + // and initialized, and the previous cntx_t struct will be overwritten. + // The function will then return the address to the newly-initialized (or + // previously-allocated-but-reinitialized) cntx_t struct. Note that some of + // this function must be executed with mutual exclusion to ensure thread + // safety and deterministic behavior. // Query the architecture id. arch_t id = bli_arch_query_id(); @@ -583,23 +662,24 @@ const cntx_t* bli_gks_query_ind_cntx // gks_id[ ind ]. gks_id_ind = bli_calloc_intl( sizeof( cntx_t ), &r_val ); gks_id[ ind ] = gks_id_ind; - - // Before we can call the induced method context initialization - // function on the newly allocated structure, we must first copy - // over the contents of the native context. - *gks_id_ind = *gks_id_nat; - - // Use the architecture id to look up the function pointer to the - // context initialization function for induced methods. - ind_cntx_init_ft f = cntx_ind_init[ id ]; - - // Now we modify the context (so that it contains the proper values - // for its induced method) by calling the context initialization - // function for the current induced method. (That function assumes - // that the context is pre- initialized with values for native - // execution.) - f( ind, gks_id_ind ); } + + // Before we can call the induced method context initialization + // function on the newly allocated structure, we must first copy + // over the contents of the native context. If a previous context + // was already copied, this will overwrite those previous values. + *gks_id_ind = *gks_id_nat; + + // Use the architecture id to look up the function pointer to the + // context initialization function for induced methods. 
+ ind_cntx_init_ft f = cntx_ind_init[ id ]; + + // Now we modify the context (so that it contains the proper values + // for its induced method) by calling the context initialization + // function for the current induced method. (That function assumes + // that the context is pre-initialized with values for native + // execution.) + f( ind, gks_id_ind ); } // END CRITICAL SECTION diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index 3a93fd59e..d1c715be1 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -46,11 +46,14 @@ const cntx_t* const * bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_cntx( void ); -BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void ); -const cntx_t* bli_gks_query_cntx_noinit( void ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void ); +const cntx_t* bli_gks_query_nat_cntx_noinit( void ); +const cntx_t* bli_gks_query_nat_cntx_impl( void ); BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind ); +const cntx_t* bli_gks_query_ind_cntx_noinit( ind_t ind ); +const cntx_t* bli_gks_query_ind_cntx_impl( ind_t ind ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); diff --git a/frame/base/bli_ind.c b/frame/base/bli_ind.c index fbe740465..cc2810d51 100644 --- a/frame/base/bli_ind.c +++ b/frame/base/bli_ind.c @@ -44,9 +44,9 @@ static const char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = void bli_ind_init( void ) { - // NOTE: Instead of calling bli_gks_query_cntx(), we call - // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). - const cntx_t* cntx = bli_gks_query_cntx_noinit(); + // NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order + // to avoid the internal call to bli_init_once(). 
+ const cntx_t* cntx = bli_gks_query_nat_cntx_noinit(); // For each precision, enable the default induced method (1m) if both of // the following conditions are met: diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index 7b62ded5c..a226b7b85 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -39,12 +39,10 @@ void bli_memsys_init( void ) { // Query a native context so we have something to pass into - // bli_pba_init_pools(). We use BLIS_DOUBLE for the datatype, - // but the dt argument is actually only used when initializing - // contexts for induced methods. - // NOTE: Instead of calling bli_gks_query_cntx(), we call - // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). - const cntx_t* cntx_p = bli_gks_query_cntx_noinit(); + // bli_pba_init_pools(). + // NOTE: We intentionally call bli_gks_query_nat_cntx_noinit() in order + // to avoid the internal call to bli_init_once(). + const cntx_t* cntx_p = bli_gks_query_nat_cntx_noinit(); // Initialize the packing block allocator and its data structures. bli_pba_init( cntx_p ); diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 9e9d47699..e7b77acbb 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -77,6 +77,17 @@ #endif +// -- MULTITHREADING ----------------------------------------------------------- + +// Enable caching of queried cntx_t pointers in the gks? +#ifdef BLIS_DISABLE_GKS_CACHING + #undef BLIS_ENABLE_GKS_CACHING +#else + // Default behavior is enabled. + #define BLIS_ENABLE_GKS_CACHING +#endif + + // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. 
diff --git a/frame/thread/bli_pthread.c b/frame/thread/bli_pthread.c index 804ace46d..b840e2b77 100644 --- a/frame/thread/bli_pthread.c +++ b/frame/thread/bli_pthread.c @@ -701,7 +701,7 @@ int bli_pthread_barrier_wait // Note that bli_pthread_switch_t has the following properties: // // 1. Access to a switch is protected by a mutex specific to that switch, and -// therefore state changes and thread-safe. +// therefore state changes are thread-safe. // // 2. An initialized switch always starts in the "off" state. // From 16d2e9ea9ca0853197b416eba701b840a8587bca Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 13 Jan 2023 20:03:01 -0600 Subject: [PATCH 123/230] Defined lt, lte, gt, gte + misc. other updates. (#712) Details: - Changed invertsc operation to be a non-destructive operation; that is, it now takes separate input and output operands. This change applies to both the object and typed APIs. - Defined an alternative square root operation, sqrtrsc, which, when operating on complex scalars, assumes the imaginary part of the input to be zero. - Changed the semantics of addm, subm, copym, axpym, scal2m, and xpbym so that when the source matrix has an implicit unit diagonal, the operation leaves the diagonal of the destination matrix untouched. Previously, the operations would interpret an implicit unit diagonal on the source matrix as a request to manifest the unit diagonal *explicitly* on output (either as something to copy in the case of copym, or something to compute with in the cases of addm, subm, axpym, scal2m, and xpbym). It turns out that this behavior was too cute by half and could cause unintended headaches for practical use cases. (This change in behavior also required small modifications to the trmv and trsv testsuite modules so that they would properly test matrices with unit diagonals.) - Added missing dependencies for copym to gemv, ger, hemv, trmv, and trsv testsuite modules. 
- Implemented level-0-like ltsc, ltesc, gtsc, gtesc operations in frame/util, which use lt, lte, gt, and gte level-0 scalar macros. - Trivial variable rename in bli_part.c to harmonize with other variable naming conventions. --- examples/oapi/04level0.c | 2 +- frame/0/bli_l0_check.c | 16 +----- frame/0/bli_l0_check.h | 13 +---- frame/0/bli_l0_fpa.c | 1 + frame/0/bli_l0_fpa.h | 1 + frame/0/bli_l0_ft.h | 27 +++++----- frame/0/bli_l0_oapi.c | 33 +------------ frame/0/bli_l0_oapi.h | 11 +---- frame/0/bli_l0_tapi.c | 26 ++++++++-- frame/0/bli_l0_tapi.h | 12 +---- frame/1m/bli_l1m_tapi.c | 30 +++++++++++ frame/base/bli_part.c | 18 +++---- frame/include/bli_scalar_macro_defs.h | 2 + frame/include/level0/bli_lt.h | 71 +++++++++++++++++++++++++++ frame/include/level0/bli_lte.h | 71 +++++++++++++++++++++++++++ frame/util/bli_util_check.c | 12 +++-- frame/util/bli_util_check.h | 8 ++- frame/util/bli_util_fpa.c | 4 ++ frame/util/bli_util_fpa.h | 4 ++ frame/util/bli_util_ft.h | 23 +++++++-- frame/util/bli_util_oapi.c | 50 +++++++++++++++++++ frame/util/bli_util_oapi.h | 6 ++- frame/util/bli_util_tapi.c | 21 ++++++++ frame/util/bli_util_tapi.h | 16 ++++++ testsuite/src/test_gemv.c | 1 + testsuite/src/test_ger.c | 1 + testsuite/src/test_hemv.c | 1 + testsuite/src/test_trmv.c | 6 +++ testsuite/src/test_trsv.c | 6 +++ 29 files changed, 379 insertions(+), 114 deletions(-) create mode 100644 frame/include/level0/bli_lt.h create mode 100644 frame/include/level0/bli_lte.h diff --git a/examples/oapi/04level0.c b/examples/oapi/04level0.c index c876ac414..72fe98200 100644 --- a/examples/oapi/04level0.c +++ b/examples/oapi/04level0.c @@ -166,7 +166,7 @@ int main( int argc, char** argv ) bli_normfsc( &zeta, &alpha ); bli_printm( "alpha := normf( zeta ) # normf() = complex modulus in complex domain.", &alpha, "%4.1f", "" ); - bli_invertsc( &gamma ); + bli_invertsc( &gamma, &gamma ); bli_printm( "gamma := 1.0 / gamma", &gamma, "%4.2f", "" ); diff --git a/frame/0/bli_l0_check.c 
b/frame/0/bli_l0_check.c index 02867a22d..a1f1c1ca1 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -55,20 +55,8 @@ GENFRONT( copysc ) GENFRONT( divsc ) GENFRONT( mulsc ) GENFRONT( sqrtsc ) +GENFRONT( sqrtrsc ) GENFRONT( subsc ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - const obj_t* chi \ - ) \ -{ \ - bli_l0_xsc_check( chi ); \ -} - GENFRONT( invertsc ) @@ -357,7 +345,7 @@ void bli_l0_xxbsc_check ( const obj_t* chi, const obj_t* psi, - const bool* is_eq + const bool* is ) { err_t e_val; diff --git a/frame/0/bli_l0_check.h b/frame/0/bli_l0_check.h index 1bbb4a756..e5818dbde 100644 --- a/frame/0/bli_l0_check.h +++ b/frame/0/bli_l0_check.h @@ -51,17 +51,8 @@ GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) +GENTPROT( sqrtrsc ) GENTPROT( subsc ) - - -#undef GENTPROT -#define GENTPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - const obj_t* chi \ - ); - GENTPROT( invertsc ) @@ -152,5 +143,5 @@ void bli_l0_xxbsc_check ( const obj_t* chi, const obj_t* psi, - const bool* is_eq + const bool* is ); diff --git a/frame/0/bli_l0_fpa.c b/frame/0/bli_l0_fpa.c index 4aa7ae764..b841ce5a5 100644 --- a/frame/0/bli_l0_fpa.c +++ b/frame/0/bli_l0_fpa.c @@ -56,6 +56,7 @@ GENFRONT( mulsc ) GENFRONT( subsc ) GENFRONT( invertsc ) GENFRONT( sqrtsc ) +GENFRONT( sqrtrsc ) GENFRONT( unzipsc ) GENFRONT( zipsc ) diff --git a/frame/0/bli_l0_fpa.h b/frame/0/bli_l0_fpa.h index 0d9b28361..623a3f69b 100644 --- a/frame/0/bli_l0_fpa.h +++ b/frame/0/bli_l0_fpa.h @@ -50,6 +50,7 @@ GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) +GENPROT( sqrtrsc ) GENPROT( unzipsc ) GENPROT( zipsc ) diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h index 01d90cc3b..9ca69d534 100644 --- a/frame/0/bli_l0_ft.h +++ b/frame/0/bli_l0_ft.h @@ -37,7 +37,7 @@ // -- Level-0 function types --------------------------------------------------- // -// addsc, divsc, subsc +// addsc, divsc, subsc, 
invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ @@ -52,18 +52,6 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) - -// invertsc - -#undef GENTDEF -#define GENTDEF( ctype, ch, opname, tsuf ) \ -\ -typedef void (*PASTECH2(ch,opname,tsuf)) \ - ( \ - conj_t conjchi, \ - ctype* chi \ - ); - INSERT_GENTDEF( invertsc ) // mulsc @@ -119,6 +107,19 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ INSERT_GENTDEF( sqrtsc ) +// sqrtrsc + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + const ctype* chi, \ + ctype* psi \ + ); + +INSERT_GENTDEF( sqrtrsc ) + // getsc #undef GENTDEF diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index 0bfdbe3b3..612babe56 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -115,38 +115,6 @@ GENFRONT( addsc ) GENFRONT( divsc ) GENFRONT( mulsc ) GENFRONT( subsc ) - - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC0(opname) \ - ( \ - const obj_t* chi \ - ) \ -{ \ - bli_init_once(); \ -\ - num_t dt = bli_obj_dt( chi ); \ -\ - conj_t conjchi = bli_obj_conj_status( chi ); \ -\ - void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ -\ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( chi ); \ -\ - /* Query a type-specific function pointer, except one that uses - void* for function arguments instead of typed pointers. 
*/ \ - PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \ -\ - f \ - ( \ - conjchi, \ - buf_chi \ - ); \ -} - GENFRONT( invertsc ) @@ -181,6 +149,7 @@ void PASTEMAC0(opname) \ } GENFRONT( sqrtsc ) +GENFRONT( sqrtrsc ) #undef GENFRONT diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h index a34252cf7..713da0d70 100644 --- a/frame/0/bli_l0_oapi.h +++ b/frame/0/bli_l0_oapi.h @@ -63,17 +63,8 @@ GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) +GENPROT( sqrtrsc ) GENPROT( subsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - const obj_t* chi \ - ); - GENPROT( invertsc ) diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index e0cdffcf3..7d6d33131 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -66,8 +66,9 @@ INSERT_GENTFUNC_BASIC( subsc, subs ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ) \ { \ bli_init_once(); \ @@ -76,7 +77,7 @@ void PASTEMAC(ch,opname) \ \ PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \ PASTEMAC(ch,kername)( chi_conj ); \ - PASTEMAC(ch,copys)( chi_conj, *chi ); \ + PASTEMAC(ch,copys)( chi_conj, *psi ); \ } INSERT_GENTFUNC_BASIC( invertsc, inverts ) @@ -176,6 +177,25 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( sqrtsc ) +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + const ctype* chi, \ + ctype* psi \ + ) \ +{ \ + bli_init_once(); \ +\ + const ctype_r chi_r = PASTEMAC(ch,real)( *chi ); \ +\ + PASTEMAC2(chr,ch,sqrt2s)( chi_r, *psi ); \ +} + +INSERT_GENTFUNCR_BASIC0( sqrtrsc ) + + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index b39303410..ead89c056 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -51,17 +51,6 @@ INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) 
INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ - ( \ - conj_t conjchi, \ - ctype* chi \ - ); - INSERT_GENTPROT_BASIC0( invertsc ) @@ -88,6 +77,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) +INSERT_GENTPROT_BASIC0( sqrtrsc ) #undef GENTPROT diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index 487116329..83ccf6853 100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -82,6 +82,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ + /* NOTE: This code was disabled after I realized that when matrix A has the + properties of having a unit diagonal (and being lower or upper stored), + the operation should only read the strictly lower/upper triangle and + leave the diagonal of B untouched. */ \ +/* if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ @@ -98,6 +103,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ rntm \ ); \ } \ +*/ \ } INSERT_GENTFUNC_BASIC( addm, addd ) @@ -146,6 +152,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ + /* NOTE: This code was disabled after I realized that when matrix A has the + properties of having a unit diagonal (and being lower or upper stored), + the operation should only read the strictly lower/upper triangle and + leave the diagonal of B untouched. 
*/ \ +/* if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ @@ -167,6 +178,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ rntm \ ); \ } \ +*/ \ } INSERT_GENTFUNC_BASIC0( copym ) @@ -219,6 +231,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ + /* NOTE: This code was disabled after I realized that when matrix A has the + properties of having a unit diagonal (and being lower or upper stored), + the operation should only read the strictly lower/upper triangle and + leave the diagonal of B untouched. */ \ +/* if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ @@ -236,6 +253,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ rntm \ ); \ } \ +*/ \ } INSERT_GENTFUNC_BASIC0( axpym ) @@ -307,6 +325,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ + /* NOTE: This code was disabled after I realized that when matrix A has the + properties of having a unit diagonal (and being lower or upper stored), + the operation should only read the strictly lower/upper triangle and + leave the diagonal of B untouched. */ \ +/* if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ @@ -327,6 +350,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ rntm \ ); \ } \ +*/ \ } INSERT_GENTFUNC_BASIC0( scal2m ) @@ -441,6 +465,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ + /* NOTE: This code was disabled after I realized that when matrix A has the + properties of having a unit diagonal (and being lower or upper stored), + the operation should only read the strictly lower/upper triangle and + leave the diagonal of B untouched. 
*/ \ +/* if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ @@ -458,6 +487,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ rntm \ ); \ } \ +*/ \ } INSERT_GENTFUNC_BASIC0( xpbym ) diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index f3a2deeb4..fd6ca3a0c 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -234,7 +234,7 @@ void bli_acquire_mpart_mdim // Compute the diagonal offset based on the m and n offsets. - doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; + doff_t diagoff_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; // Begin by copying the info, elem size, buffer, row stride, and column @@ -250,13 +250,13 @@ void bli_acquire_mpart_mdim { bli_obj_set_dims( m_part, n_part, sub_obj ); bli_obj_inc_offs( offm_inc, offn_inc, sub_obj ); - bli_obj_inc_diag_offset( diag_off_inc, sub_obj ); + bli_obj_inc_diag_offset( diagoff_inc, sub_obj ); } else // if ( bli_obj_has_trans( obj ) ) { bli_obj_set_dims( n_part, m_part, sub_obj ); bli_obj_inc_offs( offn_inc, offm_inc, sub_obj ); - bli_obj_inc_diag_offset( -diag_off_inc, sub_obj ); + bli_obj_inc_diag_offset( -diagoff_inc, sub_obj ); } @@ -457,7 +457,7 @@ void bli_acquire_mpart_ndim // Compute the diagonal offset based on the m and n offsets. 
- doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; + doff_t diagoff_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; // Begin by copying the info, elem size, buffer, row stride, and column @@ -473,13 +473,13 @@ void bli_acquire_mpart_ndim { bli_obj_set_dims( m_part, n_part, sub_obj ); bli_obj_inc_offs( offm_inc, offn_inc, sub_obj ); - bli_obj_inc_diag_offset( diag_off_inc, sub_obj ); + bli_obj_inc_diag_offset( diagoff_inc, sub_obj ); } else // if ( bli_obj_has_trans( obj ) ) { bli_obj_set_dims( n_part, m_part, sub_obj ); bli_obj_inc_offs( offn_inc, offm_inc, sub_obj ); - bli_obj_inc_diag_offset( -diag_off_inc, sub_obj ); + bli_obj_inc_diag_offset( -diagoff_inc, sub_obj ); } @@ -709,7 +709,7 @@ void bli_acquire_mpart_mndim // Compute the diagonal offset based on the m and n offsets. - doff_t diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; + doff_t diagoff_inc = ( doff_t )offm_inc - ( doff_t )offn_inc; // Begin by copying the info, elem size, buffer, row stride, and column @@ -725,13 +725,13 @@ void bli_acquire_mpart_mndim { bli_obj_set_dims( m_part, n_part, sub_obj ); bli_obj_inc_offs( offm_inc, offn_inc, sub_obj ); - bli_obj_inc_diag_offset( diag_off_inc, sub_obj ); + bli_obj_inc_diag_offset( diagoff_inc, sub_obj ); } else // if ( bli_obj_has_trans( obj ) ) { bli_obj_set_dims( n_part, m_part, sub_obj ); bli_obj_inc_offs( offn_inc, offm_inc, sub_obj ); - bli_obj_inc_diag_offset( -diag_off_inc, sub_obj ); + bli_obj_inc_diag_offset( -diagoff_inc, sub_obj ); } // If the root matrix is not general (ie: has structure defined by the diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index f567e7ef3..3d60e8ec3 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -154,6 +154,8 @@ #include "bli_dotjs.h" #include "bli_eq.h" +#include "bli_lt.h" +#include "bli_lte.h" #include "bli_fprints.h" diff --git a/frame/include/level0/bli_lt.h b/frame/include/level0/bli_lt.h new file 
mode 100644 index 000000000..b7c68ddaa --- /dev/null +++ b/frame/include/level0/bli_lt.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_LT_H +#define BLIS_LT_H + + +// lt (passed by value) + +#define bli_slt( a, b ) ( (a) < (b) ) +#define bli_dlt( a, b ) ( (a) < (b) ) +#define bli_clt( a, b ) ( bli_creal(a) < bli_creal(b) ) +#define bli_zlt( a, b ) ( bli_zreal(a) < bli_zreal(b) ) +#define bli_ilt( a, b ) ( (a) < (b) ) + +// lt0 + +#define bli_slt0( a ) ( (a) < 0.0F ) +#define bli_dlt0( a ) ( (a) < 0.0 ) +#define bli_clt0( a ) ( bli_creal(a) < 0.0F ) +#define bli_zlt0( a ) ( bli_zreal(a) < 0.0 ) + +// gt (passed by value) + +#define bli_sgt( a, b ) ( (a) > (b) ) +#define bli_dgt( a, b ) ( (a) > (b) ) +#define bli_cgt( a, b ) ( bli_creal(a) > bli_creal(b) ) +#define bli_zgt( a, b ) ( bli_zreal(a) > bli_zreal(b) ) +#define bli_igt( a, b ) ( (a) > (b) ) + +// gt0 + +#define bli_sgt0( a ) ( (a) > 0.0F ) +#define bli_dgt0( a ) ( (a) > 0.0 ) +#define bli_cgt0( a ) ( bli_creal(a) > 0.0F ) +#define bli_zgt0( a ) ( bli_zreal(a) > 0.0 ) + + + +#endif diff --git a/frame/include/level0/bli_lte.h b/frame/include/level0/bli_lte.h new file mode 100644 index 000000000..ab87ff800 --- /dev/null +++ b/frame/include/level0/bli_lte.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_LTE_H +#define BLIS_LTE_H + + +// lte (passed by value) + +#define bli_slte( a, b ) ( (a) <= (b) ) +#define bli_dlte( a, b ) ( (a) <= (b) ) +#define bli_clte( a, b ) ( bli_creal(a) <= bli_creal(b) ) +#define bli_zlte( a, b ) ( bli_zreal(a) <= bli_zreal(b) ) +#define bli_ilte( a, b ) ( (a) <= (b) ) + +// lte0 + +#define bli_slte0( a ) ( (a) <= 0.0F ) +#define bli_dlte0( a ) ( (a) <= 0.0 ) +#define bli_clte0( a ) ( bli_creal(a) <= 0.0F ) +#define bli_zlte0( a ) ( bli_zreal(a) <= 0.0 ) + +// gte (passed by value) + +#define bli_sgte( a, b ) ( (a) >= (b) ) +#define bli_dgte( a, b ) ( (a) >= (b) ) +#define bli_cgte( a, b ) ( bli_creal(a) >= bli_creal(b) ) +#define bli_zgte( a, b ) ( bli_zreal(a) >= bli_zreal(b) ) +#define bli_igte( a, b ) ( (a) >= (b) ) + +// gte0 + +#define bli_sgte0( a ) ( (a) >= 0.0F ) +#define bli_dgte0( a ) ( (a) >= 0.0 ) +#define bli_cgte0( a ) ( bli_creal(a) >= 0.0F ) +#define bli_zgte0( a ) ( bli_zreal(a) >= 0.0 ) + + + +#endif diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index a96f6f5e9..3fafb4e50 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -144,13 +144,17 @@ void PASTEMAC(opname,_check) \ ( \ const 
obj_t* chi, \ const obj_t* psi, \ - const bool* is_eq \ + const bool* is \ ) \ { \ - bli_l0_xxbsc_check( chi, psi, is_eq ); \ + bli_l0_xxbsc_check( chi, psi, is ); \ } GENFRONT( eqsc ) +GENFRONT( ltsc ) +GENFRONT( ltesc ) +GENFRONT( gtsc ) +GENFRONT( gtesc ) #undef GENFRONT @@ -160,7 +164,7 @@ void PASTEMAC(opname,_check) \ ( \ const obj_t* x, \ const obj_t* y, \ - const bool* is_eq \ + const bool* is \ ) \ { \ bli_l1v_xy_check( x, y ); \ @@ -176,7 +180,7 @@ void PASTEMAC(opname,_check) \ ( \ const obj_t* x, \ const obj_t* y, \ - const bool* is_eq \ + const bool* is \ ) \ { \ bli_l1m_xy_check( x, y ); \ diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index c3f4fd1aa..26986b52c 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -125,10 +125,14 @@ void PASTEMAC(opname,_check) \ ( \ const obj_t* chi, \ const obj_t* psi, \ - const bool* is_eq \ + const bool* is \ ); GENTPROT( eqsc ) +GENTPROT( ltsc ) +GENTPROT( ltesc ) +GENTPROT( gtsc ) +GENTPROT( gtesc ) #undef GENPROT @@ -138,7 +142,7 @@ void PASTEMAC(opname,_check) \ ( \ const obj_t* x, \ const obj_t* y, \ - const bool* is_eq \ + const bool* is \ ); GENPROT( eqv ) diff --git a/frame/util/bli_util_fpa.c b/frame/util/bli_util_fpa.c index fba513fae..4ed95d4c9 100644 --- a/frame/util/bli_util_fpa.c +++ b/frame/util/bli_util_fpa.c @@ -89,6 +89,10 @@ PASTEMAC(opname,_qfp)( num_t dt ) \ GENFRONT( eqsc ) GENFRONT( eqv ) GENFRONT( eqm ) +GENFRONT( ltsc ) +GENFRONT( ltesc ) +GENFRONT( gtsc ) +GENFRONT( gtesc ) GENFRONT( fprintv ) GENFRONT( fprintm ) //GENFRONT( printv ) diff --git a/frame/util/bli_util_fpa.h b/frame/util/bli_util_fpa.h index 9ed6a4cf7..f4b67ba36 100644 --- a/frame/util/bli_util_fpa.h +++ b/frame/util/bli_util_fpa.h @@ -69,6 +69,10 @@ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) +GENPROT( ltsc ) +GENPROT( ltesc ) +GENPROT( gtsc ) +GENPROT( gtesc ) GENPROT( fprintv ) GENPROT( fprintm ) //GENPROT( printv ) diff --git 
a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h index ccdd7ae66..39c27bd9a 100644 --- a/frame/util/bli_util_ft.h +++ b/frame/util/bli_util_ft.h @@ -207,7 +207,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ conj_t conjchi, \ const ctype* chi, \ const ctype* psi, \ - bool* is_eq \ + bool* is \ ); INSERT_GENTDEF( eqsc ) @@ -223,7 +223,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ dim_t n, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ - bool* is_eq \ + bool* is \ ); INSERT_GENTDEF( eqv ) @@ -243,10 +243,27 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + bool* is \ ); INSERT_GENTDEF( eqm ) +// ltsc, ltesc, gtsc, gtesc + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + const ctype* chi, \ + const ctype* psi, \ + bool* is \ + ); + +INSERT_GENTDEF( ltsc ) +INSERT_GENTDEF( ltesc ) +INSERT_GENTDEF( gtsc ) +INSERT_GENTDEF( gtesc ) + #endif // #ifdef BLIS_OAPI_BASIC diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index d4e5617ee..8223ffff8 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -526,6 +526,56 @@ void PASTEMAC0(opname) \ GENFRONT( eqm ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + const obj_t* chi, \ + const obj_t* psi, \ + bool* is \ + ) \ +{ \ + bli_init_once(); \ +\ + num_t dt_chi = bli_obj_dt( chi ); \ + num_t dt_psi = bli_obj_dt( psi ); \ + num_t dt; \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( chi, psi, is ); \ +\ + /* Decide which datatype will be used to query the buffer from the + constant object (if there is one). */ \ + if ( bli_is_constant( dt_psi ) ) dt = dt_chi; \ + else dt = dt_psi; \ +\ + /* If chi and psi are both constants, then we compare only the double + fields. 
*/ \ + if ( bli_is_constant( dt ) ) dt = BLIS_DOUBLE; \ +\ + void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ + void* buf_psi = bli_obj_buffer_for_1x1( dt, psi ); \ +\ + /* Query a type-specific function pointer, except one that uses + void* for function arguments instead of typed pointers. */ \ + PASTECH(opname,_vft) f = \ + PASTEMAC(opname,_qfp)( dt ); \ +\ + f \ + ( \ + buf_chi, \ + buf_psi, \ + is \ + ); \ +} + +GENFRONT( ltsc ) +GENFRONT( ltesc ) +GENFRONT( gtsc ) +GENFRONT( gtesc ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index ab48f841a..682a58cb3 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -147,12 +147,16 @@ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ const obj_t* x, \ const obj_t* y, \ - bool* is_eq \ + bool* is \ ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) +GENPROT( ltsc ) +GENPROT( ltesc ) +GENPROT( gtsc ) +GENPROT( gtesc ) #undef GENPROT diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index 5bd03882a..8611b9164 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -475,6 +475,27 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( eqm ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, kername ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + const ctype* chi, \ + const ctype* psi, \ + bool* is \ + ) \ +{ \ + bli_init_once(); \ +\ + *is = PASTEMAC(ch,kername)( *chi, *psi ); \ +} + +INSERT_GENTFUNC_BASIC( ltsc, lt ) +INSERT_GENTFUNC_BASIC( ltesc, lte ) +INSERT_GENTFUNC_BASIC( gtsc, gt ) +INSERT_GENTFUNC_BASIC( gtesc, gte ) + + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, varname ) \ \ diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index 29c67df23..b720877b5 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -202,6 +202,22 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC0( eqm ) +#undef GENTPROT +#define GENTPROT( ctype, 
ch, opname ) \ +\ +BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ + ( \ + const ctype* chi, \ + const ctype* psi, \ + bool* is \ + ); + +INSERT_GENTPROT_BASIC0( ltsc ) +INSERT_GENTPROT_BASIC0( ltesc ) +INSERT_GENTPROT_BASIC0( gtsc ) +INSERT_GENTPROT_BASIC0( gtesc ) + + #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c index e6090e1c5..f63178564 100644 --- a/testsuite/src/test_gemv.c +++ b/testsuite/src/test_gemv.c @@ -104,6 +104,7 @@ void libblis_test_gemv_deps libblis_test_subv( tdata, params, &(op->ops->subv) ); libblis_test_copyv( tdata, params, &(op->ops->copyv) ); libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); } diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c index b44fe6ba6..aad507cdc 100644 --- a/testsuite/src/test_ger.c +++ b/testsuite/src/test_ger.c @@ -101,6 +101,7 @@ void libblis_test_ger_deps libblis_test_subv( tdata, params, &(op->ops->subv) ); libblis_test_scal2v( tdata, params, &(op->ops->scal2v) ); libblis_test_dotv( tdata, params, &(op->ops->dotv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); libblis_test_gemv( tdata, params, &(op->ops->gemv) ); } diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c index 02e205392..06852f052 100644 --- a/testsuite/src/test_hemv.c +++ b/testsuite/src/test_hemv.c @@ -104,6 +104,7 @@ void libblis_test_hemv_deps libblis_test_subv( tdata, params, &(op->ops->subv) ); libblis_test_copyv( tdata, params, &(op->ops->copyv) ); libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); libblis_test_gemv( tdata, params, &(op->ops->gemv) ); } diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c index 71acc90ba..243216e96 100644 --- a/testsuite/src/test_trmv.c +++ b/testsuite/src/test_trmv.c @@ -100,6 +100,7 @@ void libblis_test_trmv_deps libblis_test_subv( tdata, params, 
&(op->ops->subv) ); libblis_test_copyv( tdata, params, &(op->ops->copyv) ); libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); libblis_test_gemv( tdata, params, &(op->ops->gemv) ); } @@ -325,6 +326,11 @@ void libblis_test_trmv_check bli_obj_set_struc( BLIS_GENERAL, &a_local ); bli_obj_set_uplo( BLIS_DENSE, &a_local ); + // If matrix A has an implicit unit diagonal, we have to make it explicit + // for the gemv below. + if ( bli_obj_has_unit_diag( a ) ) + bli_setd( &BLIS_ONE, &a_local ); + bli_gemv( alpha, &a_local, x_orig, &BLIS_ZERO, &y ); bli_subv( x, &y ); diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c index 12543cd9a..788be1b2c 100644 --- a/testsuite/src/test_trsv.c +++ b/testsuite/src/test_trsv.c @@ -100,6 +100,7 @@ void libblis_test_trsv_deps libblis_test_subv( tdata, params, &(op->ops->subv) ); libblis_test_copyv( tdata, params, &(op->ops->copyv) ); libblis_test_scalv( tdata, params, &(op->ops->scalv) ); + libblis_test_copym( tdata, params, &(op->ops->copym) ); libblis_test_gemv( tdata, params, &(op->ops->gemv) ); } @@ -330,6 +331,11 @@ void libblis_test_trsv_check bli_obj_set_struc( BLIS_GENERAL, &a_local ); bli_obj_set_uplo( BLIS_DENSE, &a_local ); + // If matrix A has an implicit unit diagonal, we have to make it explicit + // for the gemv below. + if ( bli_obj_has_unit_diag( a ) ) + bli_setd( &BLIS_ONE, &a_local ); + bli_gemv( &alpha_inv, &a_local, x, &BLIS_ZERO, &y ); bli_subv( x_orig, &y ); From 5793a77937aee9847a5692c8e44b36a6380800a1 Mon Sep 17 00:00:00 2001 From: HarshDave12 <122850830+HarshDave12@users.noreply.github.com> Date: Tue, 17 Jan 2023 21:55:02 +0530 Subject: [PATCH 124/230] Fixed mis-mapped instruction for VEXTRACTF64X2. (#713) Details: - This commit fixes a typo in the macro definition for the extended inline assembly macro VEXTRACTF64X2 in bli_x86_asm_macros.h. 
The macro was previously defined (incorrectly) in terms of the vextractf64x4 instruction rather than vextractf64x2. - CREDITS file update. --- CREDITS | 1 + frame/include/bli_x86_asm_macros.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 51afcc276..53904234e 100644 --- a/CREDITS +++ b/CREDITS @@ -24,6 +24,7 @@ but many others have contributed code and feedback, including Dilyn Corner @dilyn-corner Mat Cross @matcross (NAG) @decandia50 + Harsh Dave @HarshDave12 (AMD) Daniël de Kok @danieldk (Explosion) Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany) Jeff Diamond (Oracle) diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h index b470d320d..1c27b8ff2 100644 --- a/frame/include/bli_x86_asm_macros.h +++ b/frame/include/bli_x86_asm_macros.h @@ -1205,7 +1205,7 @@ #define VEXTRACTF128(_0, _1, _2) INSTR_(vextractf128, _0, _1, _2) #define VEXTRACTF32X4(_0, _1, _2) INSTR_(vextractf32x4, _0, _1, _2) #define VEXTRACTF32X8(_0, _1, _2) INSTR_(vextractf32x8, _0, _1, _2) -#define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2) +#define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x2, _0, _1, _2) #define VEXTRACTF64X4(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2) #define VBLENDPS(_0, _1, _2, _3) INSTR_(vblendps, _0, _1, _2, _3) #define VBLENDPD(_0, _1, _2, _3) INSTR_(vblendpd, _0, _1, _2, _3) From c334ec278f5e2a101625629b2e13bbf1b38dede5 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 18 Jan 2023 13:10:19 -0600 Subject: [PATCH 125/230] Merge tlb- and slab/rr-specific gemm macrokernels. (#711) Details: - Merged the tlb-specific gemm macrokernel (_var2b) with the slab/rr- specific one (var2) so that a single function can be compiled with either tlb or slab/rr support, depending on the value of the BLIS_ENABLE_JRIR_TLB, _SLAB, and _RR. 
This is done by incorporating information from both approaches: the start/end/inc for the JR and IR loops from slab or rr partitioning; and the number of assigned microtiles, plus the starting IR dimension offset for all iterations after the first (ir_next). With these changes, slab, rr, and tlb can all be parameterized by initializing a similar set of variables prior to the jr loop. - Removed the wrap-around logic that sets the "b_next" field of the auxinfo_t struct, which executes during the last IR iteration of the last JR iteration. The potential benefit of this code is so minor (and hinges on the microkernel making use of the b_next field) that it's arguably not worth including. The code also does the wrong thing for some threads whenever JR_NT > 1, since only thread 0 (in the JR group) would even compute with the first micropanel of B. - Re-expressed the definition of bli_is_last_iter_slrr so that slab and tlb use the same code rather than rr and tlb. - Adjusted the initialization of the gemm control tree accordingly. --- frame/3/gemm/bli_gemm_cntl.c | 7 +- frame/3/gemm/bli_gemm_ker_var2.c | 65 ++++- frame/3/gemm/bli_gemm_ker_var2b.c | 379 --------------------------- frame/include/bli_param_macro_defs.h | 6 +- 4 files changed, 58 insertions(+), 399 deletions(-) delete mode 100644 frame/3/gemm/bli_gemm_ker_var2b.c diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b9c231cf7..10484adf3 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -61,12 +61,7 @@ cntl_t* bli_gemmbp_cntl_create void_fp macro_kernel_fp; // Choose the default macrokernel based on the operation family... 
- if ( family == BLIS_GEMM ) macro_kernel_fp = - #ifdef BLIS_ENABLE_JRIR_TLB - bli_gemm_ker_var2b; - #else // ifdef ( _SLAB || _RR ) - bli_gemm_ker_var2; - #endif + if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; else if ( family == BLIS_GEMMT ) macro_kernel_fp = #ifdef BLIS_ENABLE_JRIR_TLB bli_gemmt_x_ker_var2b; diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 3e862e6c5..732d5ec06 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -244,27 +244,66 @@ void bli_gemm_ker_var2 bli_auxinfo_set_ukr( gemm_ukr, &aux ); bli_auxinfo_set_params( params, &aux ); - // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - // loop around the microkernel. Here we query the thrinfo_t node for the - // 1st (ir) loop around the microkernel. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; - // Query the number of threads and thread ids for each loop. +#ifdef BLIS_ENABLE_JRIR_TLB + + // Query the number of threads and thread ids for the jr loop around + // the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); const dim_t jr_nt = bli_thrinfo_n_way( thread ); const dim_t jr_tid = bli_thrinfo_work_id( thread ); + + const dim_t ir_nt = 1; + const dim_t ir_tid = 0; + + dim_t n_ut_for_me + = + bli_thread_range_tlb_d( jr_nt, jr_tid, m_iter, n_iter, MR, NR, + &jr_start, &ir_start ); + + // Always increment by 1 in both dimensions. + jr_inc = 1; + ir_inc = 1; + + // Each thread iterates over the entire panel of C until it exhausts its + // assigned set of microtiles. + jr_end = n_iter; + ir_end = m_iter; + + // Successive iterations of the ir loop should start at 0. + const dim_t ir_next = 0; + +#else // ifdef ( _SLAB || _RR ) + + // Query the number of threads and thread ids for the ir loop around + // the microkernel. 
+ thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); const dim_t ir_nt = bli_thrinfo_n_way( caucus ); const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end, jr_inc; - dim_t ir_start, ir_end, ir_inc; - // Determine the thread range and increment for the 2nd and 1st loops. // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + // Calculate the total number of microtiles assigned to this thread. + dim_t n_ut_for_me = ( ( ir_end + ir_inc - 1 - ir_start ) / ir_inc ) * + ( ( jr_end + jr_inc - 1 - jr_start ) / jr_inc ); + + // Each succesive iteration of the ir loop always starts at ir_start. + const dim_t ir_next = ir_start; + +#endif + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) { @@ -294,8 +333,6 @@ void bli_gemm_ker_var2 { a2 = a_cast; b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) ) - b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -350,7 +387,13 @@ void bli_gemm_ker_var2 c11, rs_c, cs_c ); } + + // Decrement the number of microtiles assigned to the thread; once + // it reaches zero, return immediately. 
+ n_ut_for_me -= 1; if ( n_ut_for_me == 0 ) return; } + + ir_start = ir_next; } } diff --git a/frame/3/gemm/bli_gemm_ker_var2b.c b/frame/3/gemm/bli_gemm_ker_var2b.c deleted file mode 100644 index 50375708a..000000000 --- a/frame/3/gemm/bli_gemm_ker_var2b.c +++ /dev/null @@ -1,379 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -typedef void (*xpbys_mxn_vft) - ( - dim_t m, - dim_t n, - void* x, inc_t rs_x, inc_t cs_x, - void* b, - void* y, inc_t rs_y, inc_t cs_y - ); - -#undef GENTFUNC2 -#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \ -\ -BLIS_INLINE void PASTEMAC2(chx,chy,op) \ - ( \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - void* b, \ - void* y, inc_t rs_y, inc_t cs_y \ - ) \ -{ \ - ctypex* restrict x_cast = x; \ - ctypey* restrict b_cast = b; \ - ctypey* restrict y_cast = y; \ -\ - PASTEMAC3(chx,chy,chy,xpbys_mxn) \ - ( \ - m, n, \ - x_cast, rs_x, cs_x, \ - b_cast, \ - y_cast, rs_y, cs_y \ - ); \ -} - -INSERT_GENTFUNC2_BASIC0(xpbys_mxnb_fn); -INSERT_GENTFUNC2_MIXDP0(xpbys_mxnb_fn); - -static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxnb_fn); - - -void bli_gemm_ker_var2b - ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread_par - ) -{ - num_t dt_exec = bli_obj_exec_dt( c ); - num_t dt_c = bli_obj_dt( c ); - - const pack_t schema_a = bli_obj_pack_schema( a ); - const pack_t schema_b = bli_obj_pack_schema( b ); - - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); - - const char* a_cast = bli_obj_buffer_at_off( a ); - const inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); - - const char* b_cast = bli_obj_buffer_at_off( b ); - const inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); - - char* c_cast = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - // If any dimension is zero, return immediately. - if ( bli_zero_dim3( m, n, k ) ) return; - - // Detach and multiply the scalars attached to A and B. 
- // NOTE: We know that the internal scalars of A and B are already of the - // target datatypes because the necessary typecasting would have already - // taken place during bli_packm_init(). - obj_t scalar_a, scalar_b; - bli_obj_scalar_detach( a, &scalar_a ); - bli_obj_scalar_detach( b, &scalar_b ); - bli_mulsc( &scalar_a, &scalar_b ); - - // Grab the addresses of the internal scalar buffers for the scalar - // merged above and the scalar attached to C. - // NOTE: We know that scalar_b is of type dt_exec due to the above code - // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, - // and we know that the internal scalar in C is already of the type dt_c - // due to the casting in the implementation of bli_obj_scalar_attach(). - const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); - const char* beta_cast = bli_obj_internal_scalar_buffer( c ); - - // If 1m is being employed on a column- or row-stored matrix with a - // real-valued beta, we can use the real domain macro-kernel, which - // eliminates a little overhead associated with the 1m virtual - // micro-kernel. - // Only employ this optimization if the storage datatype of C is - // equal to the execution/computation datatype. -#if 1 - if ( bli_cntx_method( cntx ) == BLIS_1M ) - { - bli_gemm_ind_recast_1m_params - ( - &dt_exec, - &dt_c, - schema_a, - c, - &m, &n, &k, - &pd_a, &ps_a, - &pd_b, &ps_b, - &rs_c, &cs_c, - cntx - ); - } -#endif - -#ifdef BLIS_ENABLE_GEMM_MD - // Tweak parameters in select mixed domain cases (rcc, crc, ccr). - if ( bli_cntx_method( cntx ) == BLIS_NAT ) - { - bli_gemm_md_ker_var2_recast - ( - &dt_exec, - bli_obj_dt( a ), - bli_obj_dt( b ), - &dt_c, - &m, &n, &k, - &pd_a, &ps_a, - &pd_b, &ps_b, - c, - &rs_c, &cs_c - ); - } -#endif - - const siz_t dt_size = bli_dt_size( dt_exec ); - const siz_t dt_c_size = bli_dt_size( dt_c ); - - // Alias some constants to simpler names. 
- const dim_t MR = pd_a; - const dim_t NR = pd_b; - - // Query the context for the micro-kernel address and cast it to its - // function pointer type. - gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); - - // Query the params field from the obj_t. If it is non-NULL, grab the ukr - // field of the params struct. If that function pointer is non-NULL, use it - // as our microkernel instead of the default microkernel queried from the - // cntx above. - const gemm_ker_params_t* params = bli_obj_ker_params( c ); - gemm_ukr_vft user_ukr = params ? params->ukr : NULL; - if ( user_ukr ) gemm_ukr = user_ukr; - - // Temporary C buffer for edge cases. Note that the strides of this - // temporary buffer are set so that they match the storage of the - // original C matrix. For example, if C is column-stored, ct will be - // column-stored as well. - char ct[ BLIS_STACK_BUF_MAX_SIZE ] - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); - const inc_t rs_ct = ( col_pref ? 1 : NR ); - const inc_t cs_ct = ( col_pref ? MR : 1 ); - const char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); - - // - // Assumptions/assertions: - // rs_a == 1 - // cs_a == PACKMR - // pd_a == MR - // ps_a == stride to next micro-panel of A - // rs_b == PACKNR - // cs_b == 1 - // pd_b == NR - // ps_b == stride to next micro-panel of B - // rs_c == (no assumptions) - // cs_c == (no assumptions) - // - - // Compute number of primary and leftover components of the m and n - // dimensions. - const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); - const dim_t n_left = n % NR; - - const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); - const dim_t m_left = m % MR; - - // Determine some increments used to step through A, B, and C. 
- const inc_t rstep_a = ps_a * dt_size; - - const inc_t cstep_b = ps_b * dt_size; - - const inc_t rstep_c = rs_c * MR * dt_c_size; - const inc_t cstep_c = cs_c * NR * dt_c_size; - - auxinfo_t aux; - - // Save the pack schemas of A and B to the auxinfo_t object. - bli_auxinfo_set_schema_a( schema_a, &aux ); - bli_auxinfo_set_schema_b( schema_b, &aux ); - - // Save the imaginary stride of A and B to the auxinfo_t object. - bli_auxinfo_set_is_a( is_a, &aux ); - bli_auxinfo_set_is_b( is_b, &aux ); - - // Save the virtual microkernel address and the params. - bli_auxinfo_set_ukr( gemm_ukr, &aux ); - bli_auxinfo_set_params( params, &aux ); - - // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) - // loop around the microkernel. Notice that this variant doesn't utilize - // parallelism in the 1st (ir) loop around the microkernel. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); - - const dim_t jr_nt = bli_thrinfo_n_way( thread ); - const dim_t jr_tid = bli_thrinfo_work_id( thread ); - //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); - //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - - // Determine the starting microtile offsets and number of microtiles to - // compute for each thread. Note that assignment of microtiles is done - // according to the tlb policy. - dim_t jr_st, ir_st; - const dim_t n_ut_for_me - = - bli_thread_range_tlb_d( jr_nt, jr_tid, m_iter, n_iter, MR, NR, &jr_st, &ir_st ); - - // It's possible that there are so few microtiles relative to the number - // of threads that one or more threads gets no work. If that happens, those - // threads can return early. - if ( n_ut_for_me == 0 ) return; - - // Start the jr/ir loops with the current thread's microtile offsets computed - // by bli_thread_range_tlb(). - dim_t i = ir_st; - dim_t j = jr_st; - - // Initialize a counter to track the number of microtiles computed by the - // current thread. 
- dim_t ut = 0; - - // Loop over the n dimension (NR columns at a time). - for ( ; true; ++j ) - { - const char* b1 = b_cast + j * cstep_b; - char* c1 = c_cast + j * cstep_c; - - // Compute the current microtile's width. - const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) - ? NR : n_left ); - - // Initialize our next panel of B to be the current panel of B. - const char* b2 = b1; - - bli_auxinfo_set_next_b( b2, &aux ); - - // Loop over the m dimension (MR rows at a time). - for ( ; i < m_iter; ++i ) - { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; - - // Compute the current microtile's length. - const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) - ? MR : m_left ); - - // Compute the addresses of the next panels of A and B. - const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, 1 ); - if ( bli_is_last_iter_sl( i, m_iter ) ) - { - a2 = a_cast; - b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, 1 ); - bli_auxinfo_set_next_b( b2, &aux ); - } - - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - - // Edge case handling now occurs within the microkernel itself, but - // we must still explicitly accumulate to a temporary microtile in - // situations where a virtual microkernel is being used, such as - // during the 1m method or some cases of mixed datatypes. - if ( dt_exec == dt_c ) - { - // Invoke the gemm micro-kernel. - gemm_ukr - ( - m_cur, - n_cur, - k, - ( void* )alpha_cast, - ( void* )a1, - ( void* )b1, - ( void* )beta_cast, - c11, rs_c, cs_c, - &aux, - ( cntx_t* )cntx - ); - } - else - { - // Invoke the gemm micro-kernel. - gemm_ukr - ( - MR, - NR, - k, - ( void* )alpha_cast, - ( void* )a1, - ( void* )b1, - ( void* )zero, - &ct, rs_ct, cs_ct, - &aux, - ( cntx_t* )cntx - ); - - // Accumulate to C with typecasting. 
- xpbys_mxn[ dt_exec ][ dt_c ] - ( - m_cur, n_cur, - &ct, rs_ct, cs_ct, - ( void* )beta_cast, - c11, rs_c, cs_c - ); - } - - ut += 1; - if ( ut == n_ut_for_me ) return; - } - - i = 0; - } -} - -//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: b1", k, NR, b1, NR, 1, "%4.1f", "" ); -//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: a1", MR, k, a1, 1, MR, "%4.1f", "" ); -//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 0865b11e9..fea67c0af 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -967,10 +967,10 @@ BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t BLIS_INLINE bool bli_is_last_iter_slrr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { -#ifdef BLIS_ENABLE_JRIR_SLAB - return bli_is_last_iter_sl( i, end_iter ); -#else // BLIS_ENABLE_JRIR_RR +#ifdef BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); +#else // ifdef ( _SLAB || _TLB ) + return bli_is_last_iter_sl( i, end_iter ); #endif } From ecbcf4008815035c695822fcaf106477debff89a Mon Sep 17 00:00:00 2001 From: Lee Killough <15950023+leekillough@users.noreply.github.com> Date: Wed, 18 Jan 2023 20:35:50 -0600 Subject: [PATCH 126/230] Use here-document for 'configure --help' output. (#714) Details: - Changed the configure script function that outputs "--help" text to do so via so-called "here-document" syntax for improved readability and maintainability. The change eliminates hundreds of echo statements and makes it easier to change existing configure options' help text, along with other benefits such as eliminating the need to escape double-quote characters ("). 
--- configure | 780 +++++++++++++++++++++++++++--------------------------- 1 file changed, 390 insertions(+), 390 deletions(-) diff --git a/configure b/configure index 06201b4fa..a89225107 100755 --- a/configure +++ b/configure @@ -46,396 +46,396 @@ print_usage() fi # Echo usage info. - echo " " - echo " ${script_name} (BLIS ${version})" - #echo " " - #echo " BLIS ${version}" - echo " " - echo " Configure BLIS's build system for compilation using a specified" - echo " configuration directory." - echo " " - echo " Usage:" - echo " " - echo " ${script_name} [options] [env. vars.] confname" - echo " " - echo " Arguments:" - echo " " - echo " confname The name of the sub-directory inside of the 'config'" - echo " directory containing the desired BLIS configuration." - echo " Note that confname MUST be specified; if it is not," - echo " configure will complain. To build a completely generic" - echo " implementation, use the 'generic' configuration" - echo " " - echo " Options:" - echo " " - echo " -p PREFIX, --prefix=PREFIX" - echo " " - echo " The common installation prefix for all files. If given," - echo " this option effectively implies:" - echo " --libdir=EXECPREFIX/lib" - echo " --includedir=PREFIX/include" - echo " --sharedir=PREFIX/share" - echo " where EXECPREFIX defaults to PREFIX. If this option is" - echo " not given, PREFIX defaults to '${prefix_def}'. If PREFIX" - echo " refers to a directory that does not exist, it will be" - echo " created." - echo " " - echo " --exec-prefix=EXECPREFIX" - echo " " - echo " The installation prefix for libraries. Specifically, if" - echo " given, this option effectively implies:" - echo " --libdir=EXECPREFIX/lib" - echo " If not given, EXECPREFIX defaults to PREFIX, which may be" - echo " modified by the --prefix option. If EXECPREFIX refers to" - echo " a directory that does not exist, it will be created." - echo " " - echo " --libdir=LIBDIR" - echo " " - echo " The path to which make will install libraries. 
If not" - echo " given, LIBDIR defaults to PREFIX/lib. If LIBDIR refers to" - echo " a directory that does not exist, it will be created." - echo " " - echo " --includedir=INCDIR" - echo " " - echo " The path to which make will install development header" - echo " files. If not given, INCDIR defaults to PREFIX/include." - echo " If INCDIR refers to a directory that does not exist, it" - echo " will be created." - echo " " - echo " --sharedir=SHAREDIR" - echo " " - echo " The path to which make will makefile fragments containing" - echo " make variables determined by configure (e.g. CC, CFLAGS," - echo " and LDFLAGS). These files allow certain BLIS makefiles," - echo " such as those in the examples or testsuite directories, to" - echo " operate on an installed copy of BLIS rather than a local" - echo " (and possibly uninstalled) copy. If not given, SHAREDIR" - echo " defaults to PREFIX/share. If SHAREDIR refers to a" - echo " directory that does not exist, it will be created." - echo " " - echo " --enable-verbose-make, --disable-verbose-make" - echo " " - echo " Enable (disabled by default) verbose compilation output" - echo " during make." - echo " " - echo " --enable-arg-max-hack --disable-arg-max-hack" - echo " " - echo " Enable (disabled by default) build system logic that" - echo " will allow archiving/linking the static/shared library" - echo " even if the command plus command line arguments exceeds" - echo " the operating system limit (ARG_MAX)." - echo " " - echo " -d DEBUG, --enable-debug[=DEBUG]" - echo " " - echo " Enable debugging symbols in the library. If argument" - echo " DEBUG is given as 'opt', then optimization flags are" - echo " kept in the framework, otherwise optimization is" - echo " turned off." - echo " " - echo " --disable-static, --enable-static" - echo " " - echo " Disable (enabled by default) building BLIS as a static" - echo " library. If the static library build is disabled, the" - echo " shared library build must remain enabled." 
- echo " " - echo " --disable-shared, --enable-shared" - echo " " - echo " Disable (enabled by default) building BLIS as a shared" - echo " library. If the shared library build is disabled, the" - echo " static library build must remain enabled." - echo " " - echo " --enable-rpath, --disable-rpath" - echo " " - echo " Enable (disabled by default) setting an install_name for" - echo " dynamic libraries on macOS which starts with @rpath rather" - echo " than the absolute install path." - echo " " - echo " -e SYMBOLS, --export-shared[=SYMBOLS]" - echo " " - echo " Specify the subset of library symbols that are exported" - echo " within a shared library. Valid values for SYMBOLS are:" - echo " 'public' (the default) and 'all'. By default, only" - echo " functions and variables that belong to public APIs are" - echo " exported in shared libraries. However, the user may" - echo " instead export all symbols in BLIS, even those that were" - echo " intended for internal use only. Note that the public APIs" - echo " encompass all functions that almost any user would ever" - echo " want to call, including the BLAS/CBLAS compatibility APIs" - echo " as well as the basic and expert interfaces to the typed" - echo " and object APIs that are unique to BLIS. Also note that" - echo " changing this option to 'all' will have no effect in some" - echo " environments, such as when compiling with clang on" - echo " Windows." - echo " " - echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" - echo " " - echo " Enable threading in the library, using threading model(s)" - echo " MODEL={single,openmp,pthreads,hpx,auto}. If multiple values" - echo " are specified within MODEL, they will all be compiled into" - echo " BLIS, and the choice of which to use will be determined at" - echo " runtime. 
If the user does not express a preference (by" - echo " setting the BLIS_THREAD_IMPL environment variable to" - echo " 'single', 'openmp', 'pthreads', or 'hpx'; by calling the" - echo " global runtime API bli_thread_set_thread_impl(); or by" - echo " encoding a choice on a per-call basis within a rntm_t" - echo " passed into the expert API), then the first model listed" - echo " in MODEL will be used by default. Note that 'single' is" - echo " silently appended to whatever the user specifies in MODEL," - echo " meaning that single-threaded functionality will always be" - echo " available, even if it is not requested and even if it is" - echo " not enabled by default. Even --disable-threading is" - echo " actually shorthand for --enable-threading=single (which is" - echo " the default when the option is not specified)." - echo " " - echo " --enable-system, --disable-system" - echo " " - echo " Enable conventional operating system support, such as" - echo " pthreads for thread-safety. The default state is enabled." - echo " However, in rare circumstances you may wish to configure" - echo " BLIS for use with a minimal or nonexistent operating" - echo " system (e.g. hardware simulators). In these situations," - echo " --disable-system may be used to jettison all compile-time" - echo " and link-time dependencies outside of the standard C" - echo " library. When disabled, this option also forces the use" - echo " of --disable-threading." - echo " " - echo " --disable-pba-pools, --enable-pba-pools" - echo " --disable-sba-pools, --enable-sba-pools" - echo " " - echo " Disable (enabled by default) use of internal memory pools" - echo " within the packing block allocator (pba) and/or the small" - echo " block allocator (sba). The former is used to allocate" - echo " memory used to pack submatrices while the latter is used" - echo " to allocate control/thread tree nodes and thread" - echo " communicators. 
Both allocations take place in the context" - echo " of level-3 operations. When the pba is disabled, the" - echo " malloc()-like function specified by BLIS_MALLOC_POOL is" - echo " called on-demand whenever a packing block is needed, and" - echo " when the sba is disabled, the malloc()-like function" - echo " specified by BLIS_MALLOC_INTL is called whenever a small" - echo " block is needed, with the two allocators calling free()-" - echo " like functions BLIS_FREE_POOL and BLIS_FREE_INTL," - echo " respectively when blocks are released. When enabled," - echo " either or both pools are populated via the same functions" - echo " mentioned previously, and henceforth blocks are checked" - echo " out and in. The library quickly reaches a state in which" - echo " it no longer needs to call malloc() or free(), even" - echo " across many separate level-3 operation invocations." - echo " " - echo " --enable-mem-tracing, --disable-mem-tracing" - echo " " - echo " Enable (disabled by default) output to stdout that traces" - echo " the allocation and freeing of memory, including the names" - echo " of the functions that triggered the allocation/freeing." - echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." - echo " Please use only for informational/debugging purposes." - echo " " - echo " --enable-asan, --disable-asan" - echo " " - echo " Enable (disabled by default) compiling and linking BLIS" - echo " framework code with the AddressSanitizer (ASan) library." - echo " Optimized kernels are NOT compiled with ASan support due" - echo " to limitations of register assignment in inline assembly." - echo " WARNING: ENABLING THIS OPTION WILL NEGATIVELY IMPACT" - echo " PERFORMANCE. Please use only for informational/debugging" - echo " purposes." - echo " " - echo " -i SIZE, --int-size=SIZE" - echo " " - echo " Set the size (in bits) of internal BLIS integers and" - echo " integer types used in native BLIS interfaces. 
The" - echo " default integer type size is architecture dependent." - echo " (Hint: You can always find this value printed at the" - echo " beginning of the testsuite output.)" - echo " " - echo " -b SIZE, --blas-int-size=SIZE" - echo " " - echo " Set the size (in bits) of integer types in external" - echo " BLAS and CBLAS interfaces, if enabled. The default" - echo " integer type size used in BLAS/CBLAS is 32 bits." - echo " " - echo " --disable-blas, --enable-blas" - echo " " - echo " Disable (enabled by default) building the BLAS" - echo " compatibility layer." - echo " " - echo " --enable-cblas, --disable-cblas" - echo " " - echo " Enable (disabled by default) building the CBLAS" - echo " compatibility layer. This automatically enables the" - echo " BLAS compatibility layer as well." - echo " " - echo " --disable-mixed-dt, --enable-mixed-dt" - echo " " - echo " Disable (enabled by default) support for mixing the" - echo " storage domain and/or storage precision of matrix" - echo " operands for the gemm operation, as well as support" - echo " for computing in a precision different from one or" - echo " both of matrices A and B." - echo " " - echo " --disable-mixed-dt-extra-mem, --enable-mixed-dt-extra-mem" - echo " " - echo " Disable (enabled by default) support for additional" - echo " mixed datatype optimizations that require temporarily" - echo " allocating extra memory--specifically, a single m x n" - echo " matrix (per application thread) whose storage datatype" - echo " is equal to the computation datatype. This option may" - echo " only be enabled when mixed domain/precision support is" - echo " enabled." - echo " " - echo " --disable-sup-handling, --enable-sup-handling" - echo " " - echo " Disable (enabled by default) handling of small/skinny" - echo " matrix problems via separate code branches. 
When disabled," - echo " these small/skinny level-3 operations will be performed by" - echo " the conventional implementation, which is optimized for" - echo " medium and large problems. Note that what qualifies as" - echo " \"small\" depends on thresholds that may vary by sub-" - echo " configuration." - echo " " - echo " --enable-amd-frame-tweaks, --disable-amd-frame-tweaks" - echo " " - echo " Enable building with certain framework files that have" - echo " been customized by AMD for Zen-based microarchitectures." - echo " The default counterparts of these files must be portable," - echo " and so these customized files may provide some (typically" - echo " modest) performance improvement for some select operations" - echo " and/or APIs, though there may a few (tiny dimension) cases" - echo " where the improvement is more pronounced. Note that the" - echo " target configuration must be Zen-based (or 'amd64') for" - echo " this option to have any effect. (Also note that this" - echo " option is NOT to be confused with enabling AMD *kernels*," - echo " which are determined by the BLIS subconfiguration used at" - echo " runtime.) By default, these customized files are disabled." - echo " " - echo " -a NAME --enable-addon=NAME" - echo " " - echo " Enable the code provided by an addon. An addon consists" - echo " of a separate directory of code that provides additional" - echo " APIs, implementations, and/or operations that would" - echo " otherwise not be present within a build of BLIS. This" - echo " option may be used multiple times to specify the inclusion" - echo " of multiple addons. By default, no addons are enabled." - echo " " - echo " -s NAME --enable-sandbox=NAME" - echo " " - echo " Enable a separate sandbox implementation of gemm. 
This" - echo " option disables BLIS's conventional gemm implementation" - echo " (which shares common infrastructure with other level-3" - echo " operations) and instead compiles and uses the code in" - echo " the NAME directory, which is expected to be a sub-" - echo " directory of 'sandbox'. By default, no sandboxes are" - echo " enabled." - echo " " - echo " --with-memkind, --without-memkind" - echo " " - echo " Forcibly enable or disable the use of libmemkind's" - echo " hbw_malloc() and hbw_free() as substitutes for malloc()" - echo " and free(), respectively, when allocating memory for" - echo " BLIS's memory pools, which are used to manage buffers" - echo " into which matrices are packed. The default behavior" - echo " for this option is environment-dependent; if configure" - echo " detects the presence of libmemkind, libmemkind is used" - echo " by default, and otherwise it is not used by default." - echo " " - echo " -r METHOD, --thread-part-jrir=METHOD" - echo " " - echo " Select a strategy for partitioning computation in JR and" - echo " IR loops and assigning that computation to threads. Valid" - echo " values for METHOD are 'rr', 'slab', and 'tlb':" - echo " 'rr': Assign the computation associated with whole" - echo " columns of microtiles to threads in a round-" - echo " robin fashion. When selected, round-robin" - echo " assignment is also employed during packing." - echo " 'slab': Partition the computation into N contiguous" - echo " regions, where each region contains a whole" - echo " number of microtile columns, and assign one" - echo " region to each thread. For some operations, the" - echo " number of microtile columns contained within a" - echo " given region may differ from that of other" - echo " regions, depending on how much work is implied" - echo " by each region. When selected, slab assignment" - echo " is also employed during packing." 
- echo " 'tlb': Tile-level load balancing is similar to slab," - echo " except that regions will be divided at a more" - echo " granular level (individual microtiles instead" - echo " of whole columns of microtiles) to ensure more" - echo " equitable assignment of work to threads. When" - echo " selected, tlb will only be employed for level-3" - echo " operations except trsm; due to practical and" - echo " algorithmic limitations, slab partitioning will" - echo " be used instead during packing and for trsm." - echo " The default strategy is 'slab'. NOTE: Specifying this" - echo " option constitutes a request, which may be ignored in" - echo " select situations if implementation has a good reason to" - echo " do so. (See description of 'tlb' above for an example of" - echo " this.)" - echo " " - echo " --disable-trsm-preinversion, --enable-trsm-preinversion" - echo " " - echo " Disable (enabled by default) pre-inversion of triangular" - echo " matrix diagonals when performing trsm. When pre-inversion" - echo " is enabled, diagonal elements are inverted outside of the" - echo " microkernel (e.g. during packing) so that the microkernel" - echo " can use multiply instructions. When disabled, division" - echo " instructions are used within the microkernel. Executing" - echo " these division instructions within the microkernel will" - echo " incur a performance penalty, but numerical robustness will" - echo " improve for certain cases involving denormal numbers that" - echo " would otherwise result in overflow in the pre-inverted" - echo " values." - echo " " - echo " --force-version=STRING" - echo " " - echo " Force configure to use an arbitrary version string" - echo " STRING. This option may be useful when repackaging" - echo " custom versions of BLIS by outside organizations." - echo " " - echo " -c, --show-config-lists" - echo " " - echo " Print the config and kernel lists, and kernel-to-config" - echo " map after they are read from file. 
This can be useful" - echo " when debugging certain configuration issues, and/or as" - echo " a sanity check to make sure these lists are constituted" - echo " as expected." - echo " " - echo " --complex-return=gnu|intel" - echo " " - echo " Specify the way in which complex numbers are returned" - echo " from Fortran functions, either \"gnu\" (return in" - echo " registers) or \"intel\" (return via hidden argument)." - echo " If not specified and the environment variable FC is set," - echo " attempt to determine the return type from the compiler." - echo " Otherwise, the default is \"gnu\"." - echo " " - echo " -q, --quiet Suppress informational output. By default, configure" - echo " is verbose. (NOTE: -q is not yet implemented)" - echo " " - echo " -h, --help Output this information and quit." - echo " " - echo " Environment Variables:" - echo " " - echo " CC Specifies the C compiler to use." - echo " CXX Specifies the C++ compiler to use (sandbox only)." - echo " FC Specifies the Fortran compiler to use (only to determine --complex-return)." - echo " AR Specifies the static library archiver to use." - echo " RANLIB Specifies the ranlib (library indexer) executable to use." - echo " PYTHON Specifies the python interpreter to use." - echo " CFLAGS Specifies additional compiler flags to use (prepended)." - echo " LDFLAGS Specifies additional linker flags to use (prepended)." - echo " LIBPTHREAD Pthreads library to use." - echo " " - echo " Environment variables are traditionally set prior to running configure:" - echo " " - echo " CC=gcc ./configure [options] haswell" - echo " " - echo " However, they may also be specified as command line options, e.g.:" - echo " " - echo " ./configure [options] CC=gcc haswell" - echo " " - echo " Note that not all compilers are compatible with a given" - echo " configuration." - echo " " + cat <