1616
1717// Globals
1818// Globals
19- uint32_t unp_cfg_context = 0 ;
20- uint32_t pack_sync_tile_dst_ptr = 0 ;
21- uint32_t math_sync_tile_dst_index = 0 ;
22- static constexpr int MAX_TILES_DEST = is_fp32_dest_acc_en ? 4 : 8 ;
19+ uint32_t unp_cfg_context = 0 ;
20+ uint32_t pack_sync_tile_dst_ptr = 0 ;
21+ uint32_t math_sync_tile_dst_index = 0 ;
22+ static constexpr ckernel::DstSync DST_SYNC = ckernel::DstSync::SyncHalf ;
2323
2424#ifdef LLK_TRISC_UNPACK
2525
@@ -28,6 +28,9 @@ static constexpr int MAX_TILES_DEST = is_fp32_dest_acc_en ? 4 : 8;
2828
2929void run_kernel (const volatile struct RuntimeParams * params)
3030{
31+ const int MAX_TILES_DEST =
32+ is_fp32_dest_acc_en ? (BIT32_DEST_REGISTER_HALF_SIZE / (params->num_faces * FACE_R_DIM)) : (DEST_REGISTER_HALF_SIZE / (params->num_faces * FACE_R_DIM));
33+
3134 {
3235 ZONE_SCOPED (" INIT" )
3336
@@ -79,11 +82,14 @@ void run_kernel(const volatile struct RuntimeParams* params)
7982
8083void run_kernel (const volatile struct RuntimeParams * params)
8184{
85+ const int MAX_TILES_DEST =
86+ is_fp32_dest_acc_en ? (BIT32_DEST_REGISTER_HALF_SIZE / (params->num_faces * FACE_R_DIM)) : (DEST_REGISTER_HALF_SIZE / (params->num_faces * FACE_R_DIM));
87+
8288 {
8389 ZONE_SCOPED (" INIT" )
8490
8591 _llk_math_eltwise_unary_datacopy_init_<DataCopyType::A2D, is_fp32_dest_acc_en>(params->num_faces , formats.math );
86- _llk_math_pack_sync_init_<DstSync::SyncHalf , is_fp32_dest_acc_en>();
92+ _llk_math_pack_sync_init_<DST_SYNC , is_fp32_dest_acc_en>();
8793 _llk_math_hw_configure_ (formats.math , formats.math );
8894
8995 _llk_math_eltwise_binary_sfpu_init_<SfpuType::add1>();
@@ -98,12 +104,15 @@ void run_kernel(const volatile struct RuntimeParams* params)
98104 {
99105 for (int loop = 0 ; loop < params->LOOP_FACTOR ; ++loop)
100106 {
101- for (int i = 0 ; i < params->TILE_CNT ; ++i )
107+ for (int block_start = 0 ; block_start < params->TILE_CNT ; block_start += MAX_TILES_DEST )
102108 {
103- // Only perform synchronization with unpacker, it does not copy
104- // the data when unpack_to_dest is true - as data is already in dest.
105- _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DstSync::SyncHalf, is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
106- i, formats.math , formats.math );
109+ int block_tiles = std::min (params->TILE_CNT - block_start, MAX_TILES_DEST);
110+
111+ for (int block_tile = 0 ; block_tile < block_tiles; ++block_tile)
112+ {
113+ _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DST_SYNC, is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
114+ block_tile, formats.math , formats.math );
115+ }
107116 }
108117 }
109118 }
@@ -128,11 +137,11 @@ void run_kernel(const volatile struct RuntimeParams* params)
128137
129138 for (int block_tile = 0 ; block_tile < block_tiles; ++block_tile)
130139 {
131- _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DstSync::SyncHalf , is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
132- block_start + block_tile, formats.math , formats.math );
140+ _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DST_SYNC , is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
141+ block_tile, formats.math , formats.math );
133142 }
134143
135- _llk_math_dest_section_done_<DstSync::SyncHalf , is_fp32_dest_acc_en>();
144+ _llk_math_dest_section_done_<DST_SYNC , is_fp32_dest_acc_en>();
136145 }
137146 }
138147 }
@@ -152,8 +161,8 @@ void run_kernel(const volatile struct RuntimeParams* params)
152161 /* iterations*/ params->num_faces );
153162 }
154163
155- _llk_math_wait_for_dest_available_<DstSync::SyncHalf >();
156- _llk_math_dest_section_done_<DstSync::SyncHalf , is_fp32_dest_acc_en>();
164+ _llk_math_wait_for_dest_available_<DST_SYNC >();
165+ _llk_math_dest_section_done_<DST_SYNC , is_fp32_dest_acc_en>();
157166 }
158167 }
159168 }
@@ -170,14 +179,13 @@ void run_kernel(const volatile struct RuntimeParams* params)
170179
171180 for (int block_tile = 0 ; block_tile < block_tiles; ++block_tile)
172181 {
173- _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DstSync::SyncHalf , is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
174- block_start + block_tile, formats.math , formats.math );
182+ _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DST_SYNC , is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
183+ block_tile, formats.math , formats.math );
175184 }
176185 }
177186
178- _llk_math_eltwise_binary_sfpu_start_<DstSync::SyncHalf>(/* dst_index */ block_start);
179-
180- test_utils::call_binary_sfpu_operation<APPROX_MODE, SFPU_BINARY_OPERATION, ITERATIONS>(block_start, formats.math );
187+ _llk_math_eltwise_binary_sfpu_start_<DST_SYNC>(/* dst_index */ 0 );
188+ test_utils::call_binary_sfpu_operation<APPROX_MODE, SFPU_BINARY_OPERATION, ITERATIONS, formats.math >();
181189
182190 _llk_math_eltwise_binary_sfpu_done_ ();
183191 }
@@ -191,22 +199,21 @@ void run_kernel(const volatile struct RuntimeParams* params)
191199 {
192200 int block_tiles = std::min (params->TILE_CNT - block_start, MAX_TILES_DEST);
193201
194- _llk_math_wait_for_dest_available_<DstSync::SyncHalf >();
202+ _llk_math_wait_for_dest_available_<DST_SYNC >();
195203
196204 // Copy from srcA to dest
197205 for (int block_tile = 0 ; block_tile < block_tiles; ++block_tile)
198206 {
199- _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DstSync::SyncHalf, is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
200- block_start + block_tile, formats.math , formats.math );
201- }
202-
203- // Start SFPU binary operation
204- _llk_math_eltwise_binary_sfpu_start_<DstSync::SyncHalf>(/* dst_index */ block_start);
207+ _llk_math_eltwise_unary_datacopy_<DataCopyType::A2D, DST_SYNC, is_fp32_dest_acc_en, BroadcastType::NONE, unpack_to_dest>(
208+ block_tile, formats.math , formats.math );
205209
206- test_utils::call_binary_sfpu_operation<APPROX_MODE, SFPU_BINARY_OPERATION, ITERATIONS>(block_start, formats.math );
210+ // Start SFPU binary operation
211+ _llk_math_eltwise_binary_sfpu_start_<DST_SYNC>(/* dst_index */ block_tile);
212+ test_utils::call_binary_sfpu_operation<APPROX_MODE, SFPU_BINARY_OPERATION, ITERATIONS, formats.math >();
213+ _llk_math_eltwise_binary_sfpu_done_ ();
214+ }
207215
208- _llk_math_eltwise_binary_sfpu_done_ ();
209- _llk_math_dest_section_done_<DstSync::SyncHalf, is_fp32_dest_acc_en>();
216+ _llk_math_dest_section_done_<DST_SYNC, is_fp32_dest_acc_en>();
210217 }
211218 }
212219 }
@@ -223,6 +230,9 @@ void run_kernel(const volatile struct RuntimeParams* params)
223230
224231void run_kernel (const volatile struct RuntimeParams * params)
225232{
233+ const int MAX_TILES_DEST =
234+ is_fp32_dest_acc_en ? (BIT32_DEST_REGISTER_HALF_SIZE / (params->num_faces * FACE_R_DIM)) : (DEST_REGISTER_HALF_SIZE / (params->num_faces * FACE_R_DIM));
235+
226236 {
227237 ZONE_SCOPED (" INIT" )
228238
@@ -235,7 +245,7 @@ void run_kernel(const volatile struct RuntimeParams* params)
235245 _llk_pack_init_<false , false >(formats.pack_dst , FACE_R_DIM, params->num_faces );
236246#endif
237247 // Initialize destination for packing
238- _llk_pack_dest_init_<DstSync::SyncHalf , is_fp32_dest_acc_en>();
248+ _llk_pack_dest_init_<DST_SYNC , is_fp32_dest_acc_en>();
239249
240250 PROFILER_SYNC ();
241251 }
@@ -252,7 +262,7 @@ void run_kernel(const volatile struct RuntimeParams* params)
252262
253263 for (int block_tile = 0 ; block_tile < block_tiles; block_tile++)
254264 {
255- _llk_pack_<DstSync::SyncHalf , is_fp32_dest_acc_en>(block_tile, PERF_ADDRESS (PERF_OUTPUT, block_start + block_tile));
265+ _llk_pack_<DST_SYNC , is_fp32_dest_acc_en, /* untilize */ false >(block_tile, PERF_ADDRESS (PERF_OUTPUT, block_tile));
256266 }
257267 }
258268 }
@@ -268,9 +278,9 @@ void run_kernel(const volatile struct RuntimeParams* params)
268278 _llk_packer_wait_for_math_done_ ();
269279 for (int block_tile = 0 ; block_tile < block_tiles; block_tile++)
270280 {
271- _llk_pack_<DstSync::SyncHalf , is_fp32_dest_acc_en>(block_tile, PERF_ADDRESS (PERF_OUTPUT, block_start + block_tile));
281+ _llk_pack_<DST_SYNC , is_fp32_dest_acc_en, /* untilize */ false >(block_tile, PERF_ADDRESS (PERF_OUTPUT, block_tile));
272282 }
273- _llk_pack_dest_section_done_<DstSync::SyncHalf , is_fp32_dest_acc_en>();
283+ _llk_pack_dest_section_done_<DST_SYNC , is_fp32_dest_acc_en>();
274284 }
275285 }
276286 }
0 commit comments