@@ -51,6 +51,30 @@ inline void eltwise_binary_reuse_dest_as_src()
5151 }
5252}
5353
54+ // Helper to run the eltwise binary loop with optional dest reuse and face clearing
55+ // Consolidates the repeated loop pattern for DRY
56+ template <bool is_fp32_dest_acc_en, EltwiseBinaryReuseDestType binary_reuse_dest>
57+ inline void eltwise_binary_reuse_dest_helper_func (
58+ const uint32_t loop_count, const uint32_t face_base_offset, const bool clear_fp32_dst_acc, const uint dst_index)
59+ {
60+ #pragma GCC unroll 0
61+ for (std::uint32_t face_num = 0 ; face_num < loop_count; face_num++)
62+ {
63+ eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
64+
65+ // Clear DEST face-by-face when reusing dest as source
66+ if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE)
67+ {
68+ constexpr uint32_t ZERO_ACC_MODE = p_zeroacc::CLR_16;
69+ int clear_fp32 = is_fp32_dest_acc_en && clear_fp32_dst_acc ? 1 : 0 ;
70+ auto buffer_base = is_fp32_dest_acc_en && clear_fp32_dst_acc ? get_dest_buffer_base_32b () : get_dest_buffer_base_16b ();
71+ TT_ZEROACC (ZERO_ACC_MODE, clear_fp32, 0 , ADDR_MOD_1, (buffer_base + get_dest_index_in_faces (dst_index, face_base_offset + face_num)));
72+ }
73+
74+ ckernel_template::run ();
75+ }
76+ }
77+
5478template <
5579 EltwiseBinaryType eltwise_binary_type,
5680 BroadcastType src_b_bcast_type,
@@ -61,8 +85,7 @@ template <
6185inline void _llk_math_eltwise_binary_ (const std::uint32_t num_faces, uint dst_index, const bool clear_fp32_dst_acc)
6286{
6387 LLK_ASSERT (num_faces == 1 || num_faces == 2 || num_faces == 4 , " num_faces must be 1, 2, or 4" );
64- constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0 );
65- constexpr uint32_t ZERO_ACC_MODE = p_zeroacc::CLR_16;
88+ constexpr bool high_fidelity = (NUM_FIDELITY_PHASES > 0 );
6689
6790 math::set_dst_write_addr<DstTileShape::Tile32x32, UnpackDestination::SrcRegs>(dst_index);
6891
@@ -111,125 +134,22 @@ inline void _llk_math_eltwise_binary_(const std::uint32_t num_faces, uint dst_in
111134 if constexpr (src_b_bcast_type == BroadcastType::COL)
112135 {
113136 // Mop for col broadcast only does 2 outerloops. Needs to clear B manually and call twice for full tile size
114- constexpr uint32_t outerloop = (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? 2 : 1 ;
115- if constexpr (high_fidelity)
116- {
117- #pragma GCC unroll 0
118- for (std::uint32_t face_num = 0 ; face_num < 2 ; face_num++)
119- {
120- eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
121- if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE)
122- {
123- // We clear the DEST face-by-face, given the DEST base, tile index and face index
124- int clear_fp32 = is_fp32_dest_acc_en && clear_fp32_dst_acc ? 1 : 0 ;
125- auto buffer_base = is_fp32_dest_acc_en && clear_fp32_dst_acc ? get_dest_buffer_base_32b () : get_dest_buffer_base_16b ();
126- TT_ZEROACC (
127- ZERO_ACC_MODE, clear_fp32, 0 , ADDR_MOD_1, (buffer_base + get_dest_index_in_faces (dst_index, (0 + face_num)))); // Clear faces 0 & 1
128- }
129- ckernel_template::run ();
130- }
131- }
132- else
133- {
134- #pragma GCC unroll 0
135- for (std::uint32_t face_num = 0 ; face_num < outerloop; face_num++)
136- {
137- eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
138- if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE)
139- {
140- // We clear the DEST face-by-face, given the DEST base, tile index and face index
141- int clear_fp32 = is_fp32_dest_acc_en && clear_fp32_dst_acc ? 1 : 0 ;
142- auto buffer_base = is_fp32_dest_acc_en && clear_fp32_dst_acc ? get_dest_buffer_base_32b () : get_dest_buffer_base_16b ();
143- TT_ZEROACC (
144- ZERO_ACC_MODE, clear_fp32, 0 , ADDR_MOD_1, (buffer_base + get_dest_index_in_faces (dst_index, (0 + face_num)))); // Clear faces 0 & 1
145- }
146- ckernel_template::run ();
147- }
148- }
137+ constexpr uint32_t outerloop = (high_fidelity) ? 2 : ((binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? 2 : 1 );
138+ eltwise_binary_reuse_dest_helper_func<is_fp32_dest_acc_en, binary_reuse_dest>(outerloop, 0 , clear_fp32_dst_acc, dst_index);
149139 TTI_SETRWC (p_setrwc::CLR_B, 0 , 0 , 0 , 0 , 0 );
140+
150141 if (num_faces == 4 )
151142 {
152- if constexpr (high_fidelity)
153- {
154- #pragma GCC unroll 0
155- for (std::uint32_t face_num = 0 ; face_num < 2 ; face_num++)
156- {
157- eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
158- if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE)
159- {
160- // We clear the DEST face-by-face, given the DEST base, tile index and face index
161- int clear_fp32 = is_fp32_dest_acc_en && clear_fp32_dst_acc ? 1 : 0 ;
162- auto buffer_base = is_fp32_dest_acc_en && clear_fp32_dst_acc ? get_dest_buffer_base_32b () : get_dest_buffer_base_16b ();
163- TT_ZEROACC (
164- ZERO_ACC_MODE,
165- clear_fp32,
166- 0 ,
167- ADDR_MOD_1,
168- (buffer_base + get_dest_index_in_faces (dst_index, (2 + face_num)))); // Clear faces 2 & 3
169- }
170- ckernel_template::run ();
171- }
172- }
173- else
174- {
175- #pragma GCC unroll 0
176- for (std::uint32_t face_num = 0 ; face_num < outerloop; face_num++)
177- {
178- eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
179- if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE)
180- {
181- // We clear the DEST face-by-face, given the DEST base, tile index and face index
182- int clear_fp32 = is_fp32_dest_acc_en && clear_fp32_dst_acc ? 1 : 0 ;
183- auto buffer_base = is_fp32_dest_acc_en && clear_fp32_dst_acc ? get_dest_buffer_base_32b () : get_dest_buffer_base_16b ();
184- TT_ZEROACC (
185- ZERO_ACC_MODE,
186- clear_fp32,
187- 0 ,
188- ADDR_MOD_1,
189- (buffer_base + get_dest_index_in_faces (dst_index, (2 + face_num)))); // Clear faces 2 & 3
190- }
191- ckernel_template::run ();
192- }
193- }
143+ eltwise_binary_reuse_dest_helper_func<is_fp32_dest_acc_en, binary_reuse_dest>(outerloop, 2 , clear_fp32_dst_acc, dst_index);
194144 TTI_SETRWC (p_setrwc::CLR_B, 0 , 0 , 0 , 0 , 0 );
195145 }
196146 }
197147 else
198148 {
199149 // Row and no broadcasted behaves similarly
200- const uint32_t outerloop = (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? num_faces : 1 ;
201- if constexpr (high_fidelity)
202- {
203- #pragma GCC unroll 0
204- for (std::uint32_t face_num = 0 ; face_num < num_faces; face_num++)
205- {
206- eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
207- if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE)
208- {
209- // We clear the DEST face-by-face, given the DEST base, tile index and face index
210- int clear_fp32 = is_fp32_dest_acc_en && clear_fp32_dst_acc ? 1 : 0 ;
211- auto buffer_base = is_fp32_dest_acc_en && clear_fp32_dst_acc ? get_dest_buffer_base_32b () : get_dest_buffer_base_16b ();
212- TT_ZEROACC (ZERO_ACC_MODE, clear_fp32, 0 , ADDR_MOD_1, (buffer_base + get_dest_index_in_faces (dst_index, face_num)));
213- }
214- ckernel_template::run ();
215- }
216- }
217- else
218- {
219- #pragma GCC unroll 0
220- for (std::uint32_t face_num = 0 ; face_num < outerloop; face_num++)
221- {
222- eltwise_binary_reuse_dest_as_src<binary_reuse_dest>();
223- if constexpr (binary_reuse_dest != EltwiseBinaryReuseDestType::NONE)
224- {
225- // We clear the DEST face-by-face, given the DEST base, tile index and face index
226- int clear_fp32 = is_fp32_dest_acc_en && clear_fp32_dst_acc ? 1 : 0 ;
227- auto buffer_base = is_fp32_dest_acc_en && clear_fp32_dst_acc ? get_dest_buffer_base_32b () : get_dest_buffer_base_16b ();
228- TT_ZEROACC (ZERO_ACC_MODE, clear_fp32, 0 , ADDR_MOD_1, (buffer_base + get_dest_index_in_faces (dst_index, face_num)));
229- }
230- ckernel_template::run ();
231- }
232- }
150+ const uint32_t outerloop = (high_fidelity) ? num_faces : ((binary_reuse_dest != EltwiseBinaryReuseDestType::NONE) ? num_faces : 1 );
151+ eltwise_binary_reuse_dest_helper_func<is_fp32_dest_acc_en, binary_reuse_dest>(outerloop, 0 , clear_fp32_dst_acc, dst_index);
152+
233153 if constexpr (src_b_bcast_type == BroadcastType::SCALAR)
234154 {
235155 TTI_SETRWC (p_setrwc::CLR_B, 0 , 0 , 0 , 0 , p_setrwc::SET_D);
0 commit comments