tenstorrent
diff --git a/tests/helpers/include/llk_sfpu_types.h b/tests/helpers/include/llk_sfpu_types.h
Lines changed: 1 addition & 0 deletions
diff --git a/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_typecast.h b/tt_llk_blackhole/common/inc/sfpu/ckernel_sfpu_typecast.h
Lines changed: 161 additions & 105 deletions
diff --git a/tt_llk_blackhole/llk_lib/llk_math_eltwise_unary_sfpu.h b/tt_llk_blackhole/llk_lib/llk_math_eltwise_unary_sfpu.h
Lines changed: 10 additions & 0 deletions
@@ -105,4 +105,5 @@ enum class SfpuType
     acosh,
     reduce,
     add_top_row,
+    typecast,
 };
@@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
+// SPDX-FileCopyrightText: © 2025 Jason Davies <jason@jasondavies.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 
@@ -70,64 +71,104 @@ inline void _calculate_typecast_int32_to_fp16b_()
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void _calculate_typecast_fp16b_to_int32_()
+inline void _calculate_typecast_fp32_to_int32_()
 {
-#pragma GCC unroll 0
+#pragma GCC unroll 8
     for (int d = 0; d < ITERATIONS; d++)
     {
-        sfpi::vFloat in = sfpi::dst_reg[0];
-
-        // extract exponent
-        sfpi::vInt exp = exexp(in);
-
-        v_if (exp < 0)
-        {
-            sfpi::dst_reg[0] = 0;
-        }
-        v_elseif (exp > 30)
-        {
-            // set to int32 max value in case of overflow
-            sfpi::vInt tmp = std::numeric_limits<int32_t>::max();
-            // check sign
-            v_if (in < 0)
-            {
-                // 2's complement conversion
-                tmp = (~tmp) + 1;
-            }
-            v_endif sfpi::dst_reg[0] = tmp;
-        }
-        v_else
-        {
-            // extract mantissa
-            sfpi::vInt man = exman8(in);
-            // shift the mantissa by (23-exponent) to the right
-            sfpi::vInt shift = exp - 23;
-            man              = sfpi::shft(sfpi::reinterpret<sfpi::vUInt>(man), shift);
-            // check sign
-            v_if (in < 0)
-            {
-                // 2's complement conversion
-                man = (~man) + 1;
-            }
-            v_endif sfpi::dst_reg[0] = man;
-        }
-        v_endif
-
-            sfpi::dst_reg++;
+        TTI_SFPLOAD(p_sfpu::LREG0, InstrModLoadStore::DEFAULT, ADDR_MOD_7, 0);
+        // result = 0
+        TTI_SFPLOADI(p_sfpu::LREG1, sfpi::SFPLOADI_MOD0_USHORT, 0);
+
+        // exp = in.Exp (LaneEnabled = exp >= 0)
+        TTI_SFPEXEXP(0, p_sfpu::LREG0, p_sfpu::LREG2, sfpi::SFPEXEXP_MOD1_SET_CC_SGN_EXP | sfpi::SFPEXEXP_MOD1_SET_CC_COMP_EXP);
+        // result = INT_MIN
+        TTI_SFPLOADI(p_sfpu::LREG1, sfpi::SFPLOADI_MOD0_FLOATB, 0x8000);
+        // exp -= 31 (LaneEnabled = exp < 31)
+        TTI_SFPIADD(-31 & 0xfff, p_sfpu::LREG2, p_sfpu::LREG2, sfpi::SFPIADD_MOD1_ARG_IMM | sfpi::SFPIADD_MOD1_CC_LT0);
+        // exp += 8
+        TTI_SFPIADD(8, p_sfpu::LREG2, p_sfpu::LREG2, sfpi::SFPIADD_MOD1_ARG_IMM | sfpi::SFPIADD_MOD1_CC_NONE);
+        // result = exman8(in) << (exp - 23)
+        TTI_SFPEXMAN(0, p_sfpu::LREG0, p_sfpu::LREG1, 0);
+        TTI_SFPSHFT(0, p_sfpu::LREG2, p_sfpu::LREG1, 0);
+        // LaneEnabled = true
+        TTI_SFPENCC(0, 0, 0, 0);
+
+        // LaneEnabled = in < 0
+        TTI_SFPSETCC(0, p_sfpu::LREG0, 0, sfpi::SFPSETCC_MOD1_LREG_LT0);
+        // result = -result (two's complement)
+        TTI_SFPIADD(0, p_sfpu::LCONST_0, p_sfpu::LREG1, sfpi::SFPIADD_MOD1_ARG_2SCOMP_LREG_DST | sfpi::SFPIADD_MOD1_CC_NONE);
+        // LaneEnabled = true
+        TTI_SFPENCC(0, 0, 0, 0);
+
+        TTI_SFPSTORE(p_sfpu::LREG1, InstrModLoadStore::INT32, ADDR_MOD_6, 0);
+    }
+}
+
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void _calculate_typecast_fp32_to_uint32_()
+{
+#pragma GCC unroll 8
+    for (int d = 0; d < ITERATIONS; d++)
+    {
+        TTI_SFPLOAD(p_sfpu::LREG0, InstrModLoadStore::DEFAULT, ADDR_MOD_7, 0);
+        // result = 0
+        TTI_SFPLOADI(p_sfpu::LREG1, sfpi::SFPLOADI_MOD0_USHORT, 0);
+
+        // LaneEnabled = in >= 0
+        TTI_SFPSETCC(0, p_sfpu::LREG0, 0, sfpi::SFPSETCC_MOD1_LREG_GTE0);
+        // exp = in.Exp (LaneEnabled = exp >= 0)
+        TTI_SFPEXEXP(0, p_sfpu::LREG0, p_sfpu::LREG2, sfpi::SFPEXEXP_MOD1_SET_CC_SGN_EXP | sfpi::SFPEXEXP_MOD1_SET_CC_COMP_EXP);
+        // result = 0xffffffff
+        TTI_SFPLOADI(p_sfpu::LREG1, sfpi::SFPLOADI_MOD0_SHORT, 0xffff);
+        // exp -= 32 (LaneEnabled = exp < 32)
+        TTI_SFPIADD(-32 & 0xfff, p_sfpu::LREG2, p_sfpu::LREG2, sfpi::SFPIADD_MOD1_ARG_IMM | sfpi::SFPIADD_MOD1_CC_LT0);
+        // exp += 9
+        TTI_SFPIADD(9, p_sfpu::LREG2, p_sfpu::LREG2, sfpi::SFPIADD_MOD1_ARG_IMM | sfpi::SFPIADD_MOD1_CC_NONE);
+        // result = exman8(in) << (exp - 23)
+        TTI_SFPEXMAN(0, p_sfpu::LREG0, p_sfpu::LREG1, 0);
+        TTI_SFPSHFT(0, p_sfpu::LREG2, p_sfpu::LREG1, 0);
+        // LaneEnabled = true
+        TTI_SFPENCC(0, 0, 0, 0);
+
+        TTI_SFPSTORE(p_sfpu::LREG1, InstrModLoadStore::INT32, ADDR_MOD_6, 0);
     }
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
 inline void _calculate_typecast_fp32_to_fp16b_()
 {
-#pragma GCC unroll 0
+    // This uses SFPLOADMACRO to achieve a throughput of 3 cycles per input row.
+    //
+    // Notation: [x] means scheduled by SFPLOADMACRO with VD=x.
+    //
+    // t | Load | Simple          | MAD | Round      | Store   |
+    // - | ---- | --------------- | --- | ---------- | ------- |
+    // 0 |  [a] |                 |     |            |         |
+    // 1 |  [b] |                 |     | [a] >>= 16 |         |
+    // 2 |      | a &= 1          |     |            |         |
+    // 0 |  ... | [b] += 0x7fff   |     |            |         |
+    // 1 |  ... | [a] L16 = a + b |     |            | [a]     |
+    // 2 |  ... |                 |     |            | [b] L16 |
+    //
+    // Note that [a] schedules a 32-bit store, writing all zeros except for the
+    // LSB, which may be 0 or 1.  Then, [b] schedules a 16-bit store with
+    // MOD0_FMT_BF16.  The zeros mean that even if rounding is applied by
+    // packers, the result will be truncated.
+
+    constexpr int b = p_sfpu::LREG2;
+
+#pragma GCC unroll 8
     for (int d = 0; d < ITERATIONS; d++)
     {
-        TTI_SFPLOAD(0, 0, ADDR_MOD_7, 0);
-        TTI_SFP_STOCH_RND(0, 0, 2, 0, 1, 1);
-        TTI_SFPSTORE(1, 0, ADDR_MOD_7, 0);
-        sfpi::dst_reg++;
+        int a = d & 1;
+        TT_SFPLOADMACRO((0 << 2) | (a & 3), 0, ADDR_MOD_7, a >> 2);
+        TTI_SFPLOADMACRO((1 << 2) | (b & 3), 0, ADDR_MOD_6, b >> 2);
+        TT_SFPAND(0, p_sfpu::LREG12, a, 0);
     }
+    TTI_SFPNOP;
+    TTI_SFPNOP;
+    TTI_SFPNOP;
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
@@ -166,61 +207,6 @@ inline void _calculate_typecast_int32_to_fp32_()
     }
 }
 
-template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void _calculate_typecast_fp16b_to_uint32_()
-{
-#pragma GCC unroll 0
-    for (int d = 0; d < ITERATIONS; d++)
-    {
-        sfpi::vFloat in = sfpi::dst_reg[0];
-
-        // check sign
-        v_if (in <= 0)
-        {
-            sfpi::dst_reg[0] = 0;
-        }
-        v_else
-        {
-            // extract exponent
-            sfpi::vInt exp = exexp(in);
-
-            v_if (exp < 0)
-            {
-                sfpi::dst_reg[0] = 0;
-            }
-            v_elseif (exp > 31)
-            {
-                // set to uint32 max value in case of overflow
-                sfpi::vInt tmp   = std::numeric_limits<int32_t>::max();
-                sfpi::dst_reg[0] = sfpi::setsgn(sfpi::reinterpret<sfpi::vFloat>(tmp), 1);
-            }
-            v_elseif (exp == 31)
-            {
-                // extract mantissa without hidden bit
-                sfpi::vInt man = exman9(in);
-                // shift the mantissa by (23-exponent) to the right
-                sfpi::vInt shift = exp - 23;
-                man              = sfpi::shft(sfpi::reinterpret<sfpi::vUInt>(man), shift);
-                // add hidden bit back (due to bug when shifting a 1 into MSB)
-                sfpi::dst_reg[0] = sfpi::setsgn(sfpi::reinterpret<sfpi::vFloat>(man), 1);
-            }
-            v_else
-            {
-                // extract mantissa
-                sfpi::vInt man = exman8(in);
-                // shift the mantissa by (23-exponent) to the right
-                sfpi::vInt shift = exp - 23;
-                man              = sfpi::shft(sfpi::reinterpret<sfpi::vUInt>(man), shift);
-                sfpi::dst_reg[0] = man;
-            }
-            v_endif
-        }
-        v_endif
-
-            sfpi::dst_reg++;
-    }
-}
-
 template <bool APPROXIMATION_MODE, int ITERATIONS>
 inline void _calculate_typecast_uint32_to_fp16b_()
 {
@@ -259,13 +245,12 @@ inline void _calculate_typecast_uint32_to_fp32_()
 template <bool APPROXIMATION_MODE, int ITERATIONS>
 inline void _calculate_typecast_uint16_to_uint32_()
 {
-#pragma GCC unroll 0
+#pragma GCC unroll 8
     for (int d = 0; d < ITERATIONS; d++)
     {
-        TTI_SFPLOAD(p_sfpu::LREG0, InstrModLoadStore::LO16, ADDR_MOD_7, 0);
-        TTI_SFPSTORE(p_sfpu::LREG0, InstrModLoadStore::INT32_2S_COMP, ADDR_MOD_7, 0);
-        sfpi::dst_reg++;
+        TTI_SFPLOADMACRO((0 << 2) | 0, InstrModLoadStore::LO16, ADDR_MOD_6, 0);
     }
+    TTI_SFPNOP;
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
@@ -301,5 +286,76 @@ inline void _calculate_typecast_int32_to_uint16_()
     }
 }
 
+template <bool APPROXIMATION_MODE>
+inline void _init_typecast_fp32_to_fp16b_()
+{
+    constexpr int b = p_sfpu::LREG2;
+
+    sfpi::vConstIntPrgm0 = 1;
+    sfpi::vConstIntPrgm1 = 0x7fff;
+
+    // InstructionTemplate[0]
+    TTI_SFPSHFT2(-16 & 0xfff, 0, 12, 6); // SFPSHFT2_MOD1_SHFT_IMM
+
+    // InstructionTemplate[1]
+    TTI_SFPIADD(0, p_sfpu::LREG13, 13, sfpi::SFPIADD_MOD1_CC_NONE);
+
+    // InstructionTemplate[2]
+    TTI_SFPIADD(0, b, 14, sfpi::SFPIADD_MOD1_CC_NONE);
+
+    // Macro 0: [a]
+    {
+        constexpr uint simple_bits = 0x80 | 0x40 | (3 << 3) | (4 + 2);
+        constexpr uint mad_bits    = 0;
+        constexpr uint round_bits  = 0x80 | 0x00 | (0 << 3) | (4 + 0);
+        constexpr uint store_bits  = 0x00 | 0x00 | (3 << 3) | 3;
+
+        TTI_SFPLOADI(0, sfpi::SFPLOADI_MOD0_LOWER, (mad_bits << 8) | simple_bits);
+        TTI_SFPLOADI(0, sfpi::SFPLOADI_MOD0_UPPER, (store_bits << 8) | round_bits);
+        TTI_SFPCONFIG(0, 4 + 0, 0);
+    }
+
+    // Macro 1: [b]
+    {
+        constexpr uint simple_bits = 0x80 | 0x00 | (1 << 3) | (4 + 1);
+        constexpr uint mad_bits    = 0;
+        constexpr uint round_bits  = 0;
+        constexpr uint store_bits  = 0x00 | 0x40 | (3 << 3) | 3;
+
+        TTI_SFPLOADI(0, sfpi::SFPLOADI_MOD0_LOWER, (mad_bits << 8) | simple_bits);
+        TTI_SFPLOADI(0, sfpi::SFPLOADI_MOD0_UPPER, (store_bits << 8) | round_bits);
+        TTI_SFPCONFIG(0, 4 + 1, 0);
+    }
+
+    // Misc: {
+    //   StoreMod0: 2,
+    //   UsesLoadMod0ForStore: {1,0},
+    //   UnitDelayKind: {1,1}, (WaitForElapsedInstructions=1)
+    // }
+    TTI_SFPCONFIG(0x312, 8, 1);
+}
+
+template <bool APPROXIMATION_MODE>
+inline void _init_typecast_uint16_to_uint32_()
+{
+    {
+        constexpr uint simple_bits = 0;
+        constexpr uint mad_bits    = 0;
+        constexpr uint round_bits  = 0;
+        constexpr uint store_bits  = 0x00 | 0x00 | (0 << 3) | 3;
+
+        TTI_SFPLOADI(0, sfpi::SFPLOADI_MOD0_LOWER, (mad_bits << 8) | simple_bits);
+        TTI_SFPLOADI(0, sfpi::SFPLOADI_MOD0_UPPER, (store_bits << 8) | round_bits);
+        TTI_SFPCONFIG(0, 4 + 0, 0);
+    }
+
+    // Misc: {
+    //   StoreMod0: InstrModLoadStore::INT32,
+    //   UsesLoadMod0ForStore: {0},
+    //   UnitDelayKind: {1}, (WaitForElapsedInstructions=1)
+    // }
+    TTI_SFPCONFIG(0x100 | InstrModLoadStore::INT32, 8, 1);
+}
+
 } // namespace sfpu
 } // namespace ckernel
@@ -50,6 +50,16 @@ inline void eltwise_unary_sfpu_configure_addrmod()
         }
             .set(ADDR_MOD_6);
     }
+
+    if constexpr (sfpu_op == SfpuType::typecast)
+    {
+        addr_mod_t {
+            .srca = {.incr = 0},
+            .srcb = {.incr = 0},
+            .dest = {.incr = 2},
+        }
+            .set(ADDR_MOD_6);
+    }
 }
 
 inline void eltwise_unary_sfpu_configure_mop();
Original file line number	Diff line number	Diff line change
`@@ -50,6 +50,16 @@ inline void eltwise_unary_sfpu_configure_addrmod()`
`50`	`50`	`}`
`51`	`51`	`.set(ADDR_MOD_6);`
`52`	`52`	`}`
	`53`	`+`
	`54`	`+ if constexpr (sfpu_op == SfpuType::typecast)`
	`55`	`+ {`
	`56`	`+ addr_mod_t {`
	`57`	`+ .srca = {.incr = 0},`
	`58`	`+ .srcb = {.incr = 0},`
	`59`	`+ .dest = {.incr = 2},`
	`60`	`+ }`
	`61`	`+ .set(ADDR_MOD_6);`
	`62`	`+ }`
`53`	`63`	`}`
`54`	`64`
`55`	`65`	`inline void eltwise_unary_sfpu_configure_mop();`