From b2af579ff8fc696e0844913aec78f6a6e44f4b0a Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 27 Feb 2025 14:42:11 +0700
Subject: [PATCH 01/11] split out fresnel stuff, functions.hlsl fixes

---
 include/nbl/builtin/hlsl/bxdf/fresnel.hlsl   | 155 +++++++++++
 include/nbl/builtin/hlsl/math/functions.hlsl | 278 +++----------------
 2 files changed, 200 insertions(+), 233 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
diff --git a/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
new file mode 100644
index 0000000000..5d54c6c261
--- /dev/null
+++ b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
@@ -0,0 +1,155 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_BXDF_FRESNEL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_BXDF_FRESNEL_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/numbers.hlsl"
+#include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl"
+#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+
+namespace bxdf
+{
+
+namespace impl
+{
+template<typename T>
+struct orientedEtas;
+
+template<>
+struct orientedEtas<float> // scalar case: hands back eta and 1/eta, swapped when NdotI is negative
+{
+    static bool __call(NBL_REF_ARG(float) orientedEta, NBL_REF_ARG(float) rcpOrientedEta, float NdotI, float eta)
+    {
+        const bool backside = NdotI < 0.0; // the sign of NdotI alone decides the orientation
+        const float rcpEta = 1.0 / eta; // note: eta == 0 is not guarded against
+        orientedEta = backside ? rcpEta : eta;
+        rcpOrientedEta = backside ? eta : rcpEta; // always the reciprocal of orientedEta
+        return backside; // true when NdotI < 0
+    }
+};
+
+template<>
+struct orientedEtas<float32_t3> // per-channel eta; the orientation is still chosen by one scalar NdotI
+{
+    static bool __call(NBL_REF_ARG(float32_t3) orientedEta, NBL_REF_ARG(float32_t3) rcpOrientedEta, float NdotI, float32_t3 eta)
+    {
+        const bool backside = NdotI < 0.0; // same test as the scalar specialization
+        const float32_t3 rcpEta = (float32_t3)1.0 / eta; // component-wise reciprocal, zero components are not guarded against
+        orientedEta = backside ? rcpEta:eta;
+        rcpOrientedEta = backside ? eta:rcpEta; // all three channels swap together
+        return backside; // true when NdotI < 0
+    }
+};
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T> || is_vector_v<T>) // thin front-end over impl::orientedEtas<T>
+bool getOrientedEtas(NBL_REF_ARG(T) orientedEta, NBL_REF_ARG(T) rcpOrientedEta, scalar_type_t<T> NdotI, T eta)
+{
+    return impl::orientedEtas<T>::__call(orientedEta, rcpOrientedEta, NdotI, eta); // only the float and float32_t3 specializations are visible here
+}
+
+}
+
+
+template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3) // restricted to 3-component vector types
+T reflect(NBL_CONST_REF_ARG(T) I, NBL_CONST_REF_ARG(T) N, typename vector_traits<T>::scalar_type NdotI)
+{
+    return N * 2.0f * NdotI - I; // 2*N*NdotI - I; NdotI is trusted to equal dot(N, I)
+}
+
+template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3) // restricted to 3-component vector types
+T reflect(NBL_CONST_REF_ARG(T) I, NBL_CONST_REF_ARG(T) N)
+{
+    typename vector_traits<T>::scalar_type NdotI = nbl::hlsl::dot<T>(N, I); // computed here, then forwarded to the 3-argument overload
+    return reflect<T>(I, N, NdotI);
+}
+
+template<typename T NBL_PRIMARY_REQUIRES(vector_traits<T>::Dimensions == 3)
+struct refract
+{
+    using this_t = refract;
+    using scalar_type = typename vector_traits<T>::scalar_type; // component type of T; every dot product and eta below is stored as this
+    using vector_type = T;
+
+    static this_t create(NBL_CONST_REF_ARG(vector_type) I, NBL_CONST_REF_ARG(vector_type) N, bool backside, scalar_type NdotI, scalar_type NdotI2, scalar_type rcpOrientedEta, scalar_type rcpOrientedEta2)
+    {
+        this_t retval; // every field is copied from the arguments, nothing is derived
+        retval.I = I;
+        retval.N = N;
+        retval.backside = backside;
+        retval.NdotI = NdotI;
+        retval.NdotI2 = NdotI2;
+        retval.rcpOrientedEta = rcpOrientedEta;
+        retval.rcpOrientedEta2 = rcpOrientedEta2;
+        return retval;
+    }
+
+    static this_t create(NBL_CONST_REF_ARG(vector_type) I, NBL_CONST_REF_ARG(vector_type) N, scalar_type NdotI, scalar_type eta)
+    {
+        this_t retval;
+        retval.I = I;
+        retval.N = N;
+        T orientedEta;
+        retval.backside = bxdf::getOrientedEtas<scalar_type>(orientedEta, retval.rcpOrientedEta, NdotI, eta);
+        retval.NdotI = NdotI;
+        retval.NdotI2 = NdotI * NdotI;
+        retval.rcpOrientedEta2 = retval.rcpOrientedEta * retval.rcpOrientedEta;
+        return retval;
+    }
+
+    static this_t create(NBL_CONST_REF_ARG(vector_type) I, NBL_CONST_REF_ARG(vector_type) N, scalar_type eta)
+    {
+        this_t retval;
+        retval.I = I;
+        retval.N = N;
+        retval.NdotI = nbl::hlsl::dot<vector_type>(N, I); // this overload computes the dot product itself
+        scalar_type orientedEta; // required by the call below, otherwise unused
+        retval.backside = bxdf::getOrientedEtas<scalar_type>(orientedEta, retval.rcpOrientedEta, retval.NdotI, eta);
+        retval.NdotI2 = retval.NdotI * retval.NdotI;
+        retval.rcpOrientedEta2 = retval.rcpOrientedEta * retval.rcpOrientedEta;
+        return retval;
+    }
+
+    static scalar_type computeNdotT(bool backside, scalar_type NdotI2, scalar_type rcpOrientedEta2)
+    {
+        scalar_type NdotT2 = rcpOrientedEta2 * NdotI2 + 1.0 - rcpOrientedEta2; // not clamped: a negative value reaches sqrt as-is
+        scalar_type absNdotT = nbl::hlsl::sqrt<scalar_type>(NdotT2);
+        return backside ? absNdotT : -(absNdotT); // positive on the back side, negative otherwise
+    }
+
+    vector_type doRefract()
+    {
+        return N * (NdotI * rcpOrientedEta + computeNdotT(backside, NdotI2, rcpOrientedEta2)) - rcpOrientedEta * I;
+    }
+
+    static vector_type doReflectRefract(bool _refract, NBL_CONST_REF_ARG(vector_type) _I, NBL_CONST_REF_ARG(vector_type) _N, scalar_type _NdotI, scalar_type _NdotTorR, scalar_type _rcpOrientedEta)
+    {
+        return _N * (_NdotI * (_refract ? _rcpOrientedEta : 1.0f) + _NdotTorR) - _I * (_refract ? _rcpOrientedEta : 1.0f); // with _refract == false this is _N * (_NdotI + _NdotTorR) - _I
+    }
+
+    vector_type doReflectRefract(bool r)
+    {
+        const scalar_type NdotTorR = r ? computeNdotT(backside, NdotI2, rcpOrientedEta2) : NdotI; // both branches are scalars; NdotI turns the static helper into 2*N*NdotI - I
+        return doReflectRefract(r, I, N, NdotI, NdotTorR, rcpOrientedEta);
+    }
+
+    vector_type I;
+    vector_type N;
+    bool backside; // as returned by getOrientedEtas, or as supplied to the 7-argument create
+    scalar_type NdotI;
+    scalar_type NdotI2;
+    scalar_type rcpOrientedEta;
+    scalar_type rcpOrientedEta2;
+};
+
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index a36c2027f8..41e1f376a1 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -6,6 +6,8 @@
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/numbers.hlsl"
+#include "nbl/builtin/hlsl/vector_utils/vector_traits.hlsl"
+#include "nbl/builtin/hlsl/concepts/vector.hlsl"
 #include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
 
 namespace nbl
@@ -24,28 +26,31 @@ struct lp_norm;
 template<typename T>
 struct lp_norm<T,0,false>
 {
-    static scalar_type_t<T> __call(const T v)
+    using scalar_type = typename vector_traits<T>::scalar_type;
+
+    static scalar_type __call(const T v) // LP == 0 case: largest absolute component of v
     {
-        scalar_type_t<T> retval = abs<T>(v[0]);
+        scalar_type retval = nbl::hlsl::abs<scalar_type>(v[0]); // v[0] is one component, so the helper is instantiated on the scalar type
         for (int i = 1; i < extent<T>::value; i++)
-            retval = max<T>(abs<T>(v[i]),retval);
+            retval = nbl::hlsl::max<scalar_type>(nbl::hlsl::abs<scalar_type>(v[i]),retval);
         return retval;
     }
 };
 
-// TOOD: is this doing what it should be?
 template<typename T>
-struct lp_norm<T,1,false>
+struct lp_norm<T,1,true>
 {
-    static scalar_type_t<T> __sum(const T v)
+    using scalar_type = typename vector_traits<T>::scalar_type;
+
+    static scalar_type __sum(const T v) // sum of the absolute values of every component of v
     {
-        scalar_type_t<T> retval = abs<T>(v[0]);
+        scalar_type retval = nbl::hlsl::abs<scalar_type>(v[0]); // v[0] is one component, so the helper is instantiated on the scalar type
         for (int i = 1; i < extent<T>::value; i++)
-            retval += abs<T>(v[i]);
+            retval += nbl::hlsl::abs<scalar_type>(v[i]);
         return retval;
     }
 
-    static scalar_type_t<T> __call(const T v)
+    static scalar_type __call(const T v)
     {
         return __sum(v);
     }
@@ -54,218 +59,32 @@ struct lp_norm<T,1,false>
 template<typename T>
 struct lp_norm<T,2,false>
 {
-    static scalar_type_t<T> __sum(const T v)
+    using scalar_type = typename vector_traits<T>::scalar_type;
+
+    static scalar_type __sum(const T v) // dot(v, v): the value __call takes the square root of
     {
-        return dot<T>(v, v);   // TODO: wait for overloaded dot?
+        return nbl::hlsl::dot<T>(v, v);
     }
 
-    static scalar_type_t<T> __call(const T v)
+    static scalar_type __call(const T v) // square root of __sum(v)
     {
-        return sqrt<T>(__sum(v));
+        return nbl::hlsl::sqrt<scalar_type>(__sum(v));
     }
 };
-
-// TODO: even/odd cases
 }
 
-template<typename T, uint32_t LP NBL_FUNC_REQUIRES(LP>0)
+template<typename T, uint32_t LP NBL_FUNC_REQUIRES((concepts::FloatingPointVector<T> || concepts::FloatingPointVectorial<T>) && LP>0) // LP == 0 is excluded: that specialization provides no __sum
 scalar_type_t<T> lpNormPreroot(NBL_CONST_REF_ARG(T) v)
 {
     return impl::lp_norm<T,LP>::__sum(v);
 }
 
-template<typename T, uint32_t LP>
+template<typename T, uint32_t LP NBL_FUNC_REQUIRES(concepts::FloatingPointVector<T> || concepts::FloatingPointVectorial<T>) // no bound on LP here; it must match an impl::lp_norm specialization
 scalar_type_t<T> lpNorm(NBL_CONST_REF_ARG(T) v)
 {
     return impl::lp_norm<T,LP>::__call(v);
 }
 
-
-template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T, 3> reflect(vector<T, 3> I, vector<T, 3> N, T NdotI)
-{
-    return N * 2.0f * NdotI - I;
-}
-
-template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T, 3> reflect(vector<T, 3> I, vector<T, 3> N)
-{
-    T NdotI = dot<T>(N, I);
-    return reflect<T>(I, N, NdotI);
-}
-
-
-namespace impl
-{
-template<typename T>
-struct orientedEtas;
-
-template<>
-struct orientedEtas<float>
-{
-    static bool __call(NBL_REF_ARG(float) orientedEta, NBL_REF_ARG(float) rcpOrientedEta, float NdotI, float eta)
-    {
-        const bool backside = NdotI < 0.0;
-        const float rcpEta = 1.0 / eta;
-        orientedEta = backside ? rcpEta : eta;
-        rcpOrientedEta = backside ? eta : rcpEta;
-        return backside;
-    }
-};
-
-template<>
-struct orientedEtas<float32_t3>
-{
-    static bool __call(NBL_REF_ARG(float32_t3) orientedEta, NBL_REF_ARG(float32_t3) rcpOrientedEta, float NdotI, float32_t3 eta)
-    {
-        const bool backside = NdotI < 0.0;
-        const float32_t3 rcpEta = (float32_t3)1.0 / eta;
-        orientedEta = backside ? rcpEta:eta;
-        rcpOrientedEta = backside ? eta:rcpEta;
-        return backside;
-    }
-};
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T> || is_vector_v<T>)
-bool getOrientedEtas(NBL_REF_ARG(T) orientedEta, NBL_REF_ARG(T) rcpOrientedEta, scalar_type_t<T> NdotI, T eta)
-{
-    return impl::orientedEtas<T>::__call(orientedEta, rcpOrientedEta, NdotI, eta);
-}
-
-
-namespace impl
-{
-template<typename T>
-struct refract
-{
-    using this_t = refract;
-    using vector_type = vector<T,3>;
-
-    static this_t create(vector_type I, vector_type N, bool backside, T NdotI, T NdotI2, T rcpOrientedEta, T rcpOrientedEta2)
-    {
-        this_t retval;
-        retval.I = I;
-        retval.N = N;
-        retval.backside = backside;
-        retval.NdotI = NdotI;
-        retval.NdotI2 = NdotI2;
-        retval.rcpOrientedEta = rcpOrientedEta;
-        retval.rcpOrientedEta2 = rcpOrientedEta2;
-        return retval;
-    }
-
-    static this_t create(vector_type I, vector_type N, T NdotI, T eta)
-    {
-        this_t retval;
-        retval.I = I;
-        retval.N = N;
-        T orientedEta;
-        retval.backside = getOrientedEtas<T>(orientedEta, retval.rcpOrientedEta, NdotI, eta);
-        retval.NdotI = NdotI;
-        retval.NdotI2 = NdotI * NdotI;
-        retval.rcpOrientedEta2 = retval.rcpOrientedEta * retval.rcpOrientedEta;
-        return retval;
-    }
-
-    static this_t create(vector_type I, vector_type N, T eta)
-    {
-        this_t retval;
-        retval.I = I;
-        retval.N = N;
-        retval.NdotI = dot<T>(N, I);
-        T orientedEta;
-        retval.backside = getOrientedEtas<T>(orientedEta, retval.rcpOrientedEta, retval.NdotI, eta);        
-        retval.NdotI2 = retval.NdotI * retval.NdotI;
-        retval.rcpOrientedEta2 = retval.rcpOrientedEta * retval.rcpOrientedEta;
-        return retval;
-    }
-
-    T computeNdotT()
-    {
-        T NdotT2 = rcpOrientedEta2 * NdotI2 + 1.0 - rcpOrientedEta2;
-        T absNdotT = sqrt<T>(NdotT2);
-        return backside ? absNdotT : -(absNdotT);
-    }
-
-    vector_type doRefract()
-    {
-        return N * (NdotI * rcpOrientedEta + computeNdotT()) - rcpOrientedEta * I;
-    }
-
-    static vector_type doReflectRefract(bool _refract, vector_type _I, vector_type _N, T _NdotI, T _NdotTorR, T _rcpOrientedEta)
-    {    
-        return _N * (_NdotI * (_refract ? _rcpOrientedEta : 1.0f) + _NdotTorR) - _I * (_refract ? _rcpOrientedEta : 1.0f);
-    }
-
-    vector_type doReflectRefract(bool r)
-    {
-        const T NdotTorR = r ? computeNdotT() : NdotI;
-        return doReflectRefract(r, I, N, NdotI, NdotTorR, rcpOrientedEta);
-    }
-
-    vector_type I;
-    vector_type N;
-    bool backside;
-    T NdotI;
-    T NdotI2;
-    T rcpOrientedEta;
-    T rcpOrientedEta2;
-};
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T,3> refract(vector<T,3> I, vector<T,3> N, bool backside, T NdotI, T NdotI2, T rcpOrientedEta, T rcpOrientedEta2)
-{
-    impl::refract<T> r = impl::refract<T>::create(I, N, backside, NdotI, NdotI2, rcpOrientedEta, rcpOrientedEta2);
-    return r.doRefract();
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T,3> refract(vector<T,3> I, vector<T,3> N, T NdotI, T eta)
-{
-    impl::refract<T> r = impl::refract<T>::create(I, N, NdotI, eta);
-    return r.doRefract();
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T,3> refract(vector<T,3> I, vector<T,3> N, T eta)
-{
-    impl::refract<T> r = impl::refract<T>::create(I, N, eta);
-    return r.doRefract();
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-T reflectRefract_computeNdotT(bool backside, T NdotI2, T rcpOrientedEta2)
-{
-    impl::refract<T> r;
-    r.NdotI2 = NdotI2;
-    r.rcpOrientedEta2 = rcpOrientedEta2;
-    r.backside = backside;
-    return r.computeNdotT();
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T,3> reflectRefract_impl(bool _refract, vector<T,3> _I, vector<T,3> _N, T _NdotI, T _NdotTorR, T _rcpOrientedEta)
-{
-    return impl::refract<T>::doReflectRefract(_refract, _I, _N, _NdotI, _NdotTorR, _rcpOrientedEta);
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T,3> reflectRefract(bool _refract, vector<T,3> I, vector<T,3> N, bool backside, T NdotI, T NdotI2, T rcpOrientedEta, T rcpOrientedEta2)
-{
-    impl::refract<T> r = impl::refract<T>::create(I, N, backside, NdotI, NdotI2, rcpOrientedEta, rcpOrientedEta2);
-    return r.doReflectRefract(_refract);
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-vector<T,3> reflectRefract(bool _refract, vector<T,3> I, vector<T,3> N, T NdotI, T eta)
-{
-    impl::refract<T> r = impl::refract<T>::create(I, N, NdotI, eta);
-    return r.doReflectRefract(_refract);
-}
-
-
 // valid only for `theta` in [-PI,PI]
 template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
 void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
@@ -275,13 +94,21 @@ void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
     s = (theta < 0.0) ? -s : s; // TODO: test with XOR
 }
 
-template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
-matrix<T, 2, 3> frisvad(vector<T, 3> n)
+template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3)
+void frisvad(NBL_CONST_REF_ARG(T) normal, NBL_REF_ARG(T) tangent, NBL_REF_ARG(T) bitangent) // results are written to tangent/bitangent rather than returned as a matrix
 {
-	const T a = 1.0 / (1.0 + n.z);
-	const T b = -n.x * n.y * a;
-	return (n.z < -0.9999999) ? matrix<T, 2, 3>(vector<T, 3>(0.0,-1.0,0.0), vector<T, 3>(-1.0,0.0,0.0)) : 
-        matrix<T, 2, 3>(vector<T, 3>(1.0-n.x*n.x*a, b, -n.x), vector<T, 3>(b, 1.0-n.y*n.y*a, -n.y));
+	const typename vector_traits<T>::scalar_type a = 1.0 / (1.0 + normal.z); // divides by zero at normal.z == -1; the branch below avoids using it there
+	const typename vector_traits<T>::scalar_type b = -normal.x * normal.y * a;
+    if (normal.z < -0.9999999) // near normal.z == -1: constant outputs, a and b are ignored
+    {
+        tangent = T(0.0,-1.0,0.0);
+        bitangent = T(-1.0,0.0,0.0);
+    }
+    else
+    {
+        tangent = T(1.0-normal.x*normal.x*a, b, -normal.x); // presumably expects a unit-length normal - TODO confirm
+        bitangent = T(b, 1.0-normal.y*normal.y*a, -normal.y);
+    }
 }
 
 bool partitionRandVariable(float leftProb, NBL_REF_ARG(float) xi, NBL_REF_ARG(float) rcpChoiceProb)
@@ -303,40 +130,25 @@ bool partitionRandVariable(float leftProb, NBL_REF_ARG(float) xi, NBL_REF_ARG(fl
 }
 
 
-// TODO: make it work in C++, ignoring problem for now
-#ifdef __HLSL_VERSION
+// TODO: impl signed integer versions
 // @ return abs(x) if cond==true, max(x,0.0) otherwise
-template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T> || is_vector_v<T>)
+template <typename T NBL_FUNC_REQUIRES(is_floating_point_v<T> || concepts::FloatingPointVector<T> || concepts::FloatingPointVectorial<T>)
 T conditionalAbsOrMax(bool cond, T x, T limit);
 
 template <>
 float conditionalAbsOrMax<float>(bool cond, float x, float limit)
 {
-    const float condAbs = asfloat(asuint(x) & uint(cond ? 0x7fFFffFFu : 0xffFFffFFu));
-    return max(condAbs,limit);
+    const float condAbs = nbl::hlsl::bit_cast<float32_t, uint32_t>(nbl::hlsl::bit_cast<uint32_t, float32_t>(x) & uint(cond ? 0x7fFFffFFu : 0xffFFffFFu));
+    return nbl::hlsl::max<float>(condAbs,limit);
 }
 
-template <>
-float32_t2 conditionalAbsOrMax<float32_t2>(bool cond, float32_t2 x, float32_t2 limit)
+template <uint16_t N>
+vector<float, N> conditionalAbsOrMax<vector<float, N> >(bool cond, NBL_CONST_REF_ARG(vector<float, N>) x, NBL_CONST_REF_ARG(vector<float, N>) limit)
 {
-    const float32_t2 condAbs = asfloat(asuint(x) & select(cond, (uint32_t2)0x7fFFffFFu, (uint32_t2)0xffFFffFFu));
-    return max(condAbs,limit);
+    const vector<float, N> condAbs = nbl::hlsl::bit_cast<vector<float, N>, vector<uint, N> >(nbl::hlsl::bit_cast<vector<uint, N>, vector<float, N> >(x) & nbl::hlsl::mix((vector<uint, N>)0x7fFFffFFu, (vector<uint, N>)0xffFFffFFu, promote<vector<bool, N>, bool>(cond)));
+    return nbl::hlsl::max<vector<float, N> >(condAbs,limit);
 }
 
-template <>
-float32_t3 conditionalAbsOrMax<float32_t3>(bool cond, float32_t3 x, float32_t3 limit)
-{
-    const float32_t3 condAbs = asfloat(asuint(x) & select(cond, (uint32_t3)0x7fFFffFFu, (uint32_t3)0xffFFffFFu));
-    return max(condAbs,limit);
-}
-
-template <>
-float32_t4 conditionalAbsOrMax<float32_t4>(bool cond, float32_t4 x, float32_t4 limit)
-{
-    const float32_t4 condAbs = asfloat(asuint(x) & select(cond, (uint32_t4)0x7fFFffFFu, (uint32_t4)0xffFFffFFu));
-    return max(condAbs,limit);
-}
-#endif
 
 namespace impl
 {
@@ -441,7 +253,7 @@ float getSumofArccosABCD(float cosA, float cosB, float cosC, float cosD)
 }
 
 template<typename T, uint16_t M, uint16_t N, uint16_t P NBL_FUNC_REQUIRES(is_scalar_v<T>)
-matrix<T,M,P> applyChainRule(matrix<T,N,M> dFdG, matrix<T,M,P> dGdR)
+matrix<T,N,P> applyChainRule(NBL_CONST_REF_ARG(matrix<T,N,M>) dFdG, NBL_CONST_REF_ARG(matrix<T,M,P>) dGdR) // (N x M) * (M x P) yields N x P; same type as before when N == M
 {
     return mul(dFdG,dGdR);
 }

From 8e8c55c8e19ae474e62eedad137068dc884e739e Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 27 Feb 2025 17:40:55 +0700
Subject: [PATCH 02/11] fix conditionalAbsOrMax

---
 include/nbl/builtin/hlsl/math/functions.hlsl | 36 ++++++++++++++------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index 41e1f376a1..2972023540 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -130,23 +130,39 @@ bool partitionRandVariable(float leftProb, NBL_REF_ARG(float) xi, NBL_REF_ARG(fl
 }
 
 
+namespace impl
+{
 // TODO: impl signed integer versions
 // @ return abs(x) if cond==true, max(x,0.0) otherwise
-template <typename T NBL_FUNC_REQUIRES(is_floating_point_v<T> || concepts::FloatingPointVector<T> || concepts::FloatingPointVectorial<T>)
-T conditionalAbsOrMax(bool cond, T x, T limit);
+template<typename T NBL_PRIMARY_REQUIRES(is_floating_point_v<T> || concepts::FloatingPointVector<T> || concepts::FloatingPointVectorial<T>)
+struct ConditionalAbsOrMax;
+
+template<>
+struct ConditionalAbsOrMax<float>
+{
+    static float absOrMax(bool cond, float x, float limit) // max(|x|, limit) when cond is true, max(x, limit) otherwise
+    {
+        const float condAbs = nbl::hlsl::bit_cast<float32_t, uint32_t>(nbl::hlsl::bit_cast<uint32_t, float32_t>(x) & uint32_t(cond ? 0x7fFFffFFu : 0xffFFffFFu)); // the 0x7fFFffFF mask clears the sign bit, the all-ones mask leaves x untouched
+        return nbl::hlsl::max<float>(condAbs,limit);
+    }
+};
 
-template <>
-float conditionalAbsOrMax<float>(bool cond, float x, float limit)
+template<uint32_t N>
+struct ConditionalAbsOrMax<vector<float, N> >
 {
-    const float condAbs = nbl::hlsl::bit_cast<float32_t, uint32_t>(nbl::hlsl::bit_cast<uint32_t, float32_t>(x) & uint(cond ? 0x7fFFffFFu : 0xffFFffFFu));
-    return nbl::hlsl::max<float>(condAbs,limit);
+    static vector<float, N> absOrMax(bool cond, NBL_CONST_REF_ARG(vector<float, N>) x, NBL_CONST_REF_ARG(vector<float, N>) limit) // max(|x|, limit) when cond is true, max(x, limit) otherwise
+    {
+        const vector<float, N> condAbs = nbl::hlsl::bit_cast<vector<float, N>, vector<uint32_t, N> >(nbl::hlsl::bit_cast<vector<uint32_t, N>, vector<float, N> >(x) & nbl::hlsl::mix((vector<uint32_t, N>)0xffFFffFFu, (vector<uint32_t, N>)0x7fFFffFFu, promote<vector<bool, N>, bool>(cond))); // mix() yields its 2nd operand where the selector is true, so the sign-clearing mask goes second
+        return nbl::hlsl::max<vector<float, N> >(condAbs,limit);
+    }
+};
+
 }
 
-template <uint16_t N>
-vector<float, N> conditionalAbsOrMax<vector<float, N> >(bool cond, NBL_CONST_REF_ARG(vector<float, N>) x, NBL_CONST_REF_ARG(vector<float, N>) limit)
+template<typename T>
+T conditionalAbsOrMax(bool cond, NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) limit) // public entry point; the per-type work lives in impl::ConditionalAbsOrMax
 {
-    const vector<float, N> condAbs = nbl::hlsl::bit_cast<vector<float, N>, vector<uint, N> >(nbl::hlsl::bit_cast<vector<uint, N>, vector<float, N> >(x) & nbl::hlsl::mix((vector<uint, N>)0x7fFFffFFu, (vector<uint, N>)0xffFFffFFu, promote<vector<bool, N>, bool>(cond)));
-    return nbl::hlsl::max<vector<float, N> >(condAbs,limit);
+    return impl::ConditionalAbsOrMax<T>::absOrMax(cond, x, limit); // only the float and vector<float, N> specializations are visible here
 }
 
 

From 107ca800645254e95ae64d36ab8336995afcefdc Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 28 Feb 2025 16:58:28 +0700
Subject: [PATCH 03/11] fix getArccosSumofABC_minus_PI incorrect operator

---
 include/nbl/builtin/hlsl/math/functions.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index 2972023540..d931b311a4 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -204,7 +204,7 @@ struct trigonometry
         const bool ABltC = cosSumAB < tmp2;
         // apply triple angle formula
         const float absArccosSumABC = acos<float>(clamp<float>(cosSumAB * tmp2 - (tmp0 * tmp4 + tmp3 * tmp1) * tmp5, -1.f, 1.f));
-        return ((AltminusB ? ABltC : ABltminusC) ? (-absArccosSumABC) : absArccosSumABC) + (AltminusB | ABltminusC ? numbers::pi<float> : (-numbers::pi<float>));
+        return ((AltminusB ? ABltC : ABltminusC) ? (-absArccosSumABC) : absArccosSumABC) + ((AltminusB || ABltminusC) ? numbers::pi<float> : (-numbers::pi<float>));
     }
 
     static void combineCosForSumOfAcos(float cosA, float cosB, float biasA, float biasB, NBL_REF_ARG(float) out0, NBL_REF_ARG(float) out1)

From 8009dab4ccddb793730821ca7e849fc7c246fe48 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 3 Mar 2025 10:46:34 +0700
Subject: [PATCH 04/11] fix typo

---
 include/nbl/builtin/hlsl/bxdf/fresnel.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
index e28c1faf44..2e0d6a6fa0 100644
--- a/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
+++ b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
@@ -87,7 +87,7 @@ struct refract
         this_t retval;
         retval.I = I;
         retval.N = N;
-        T orientedEta;
+        scalar_type orientedEta;
         retval.backside = getOrientedEtas<scalar_type>(orientedEta, retval.rcpOrientedEta, NdotI, eta);
         retval.NdotI = NdotI;
         retval.NdotI2 = NdotI * NdotI;

From 93c051d10fff4c5bdd5dcf886b5ddd4c2b11e4ec Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 3 Mar 2025 11:20:30 +0700
Subject: [PATCH 05/11] fix typo bug

---
 include/nbl/builtin/hlsl/bxdf/fresnel.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
index 2e0d6a6fa0..d3b3543a28 100644
--- a/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
+++ b/include/nbl/builtin/hlsl/bxdf/fresnel.hlsl
@@ -62,7 +62,7 @@ T reflect(T I, T N, typename vector_traits<T>::scalar_type NdotI)
     return N * 2.0f * NdotI - I;
 }
 
-template<typename T NBL_PRIMARY_REQUIRES(vector_traits<T>::Dimensions == 3)
+template<typename T NBL_PRIMARY_REQUIRES(vector_traits<T>::Dimension == 3)
 struct refract
 {
     using this_t = refract<T>;

From 0c2de650d20caa61986f253b598228596bcd3f2c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 4 Mar 2025 14:43:18 +0700
Subject: [PATCH 06/11] erf for fp16 and fp64

---
 include/nbl/builtin/hlsl/tgmath/impl.hlsl | 307 +++++++++++++++++-----
 1 file changed, 237 insertions(+), 70 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
index 6e80ef2fd6..faa939459e 100644
--- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl
+++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
@@ -50,26 +50,26 @@ template<typename T NBL_STRUCT_CONSTRAINABLE>
 struct sin_helper;
 template<typename T NBL_STRUCT_CONSTRAINABLE>
 struct acos_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct tan_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct asin_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct atan_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct sinh_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct cosh_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct tanh_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct asinh_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct acosh_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct atanh_helper;
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct atan2_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct tan_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct asin_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct atan_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct sinh_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct cosh_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct tanh_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct asinh_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct acosh_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct atanh_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct atan2_helper;
 
 template<typename T NBL_STRUCT_CONSTRAINABLE>
 struct sqrt_helper;
@@ -115,15 +115,15 @@ struct HELPER_NAME<BOOST_PP_SEQ_FOR_EACH_I(WRAP, _, ARG_TYPE_LIST) NBL_PARTIAL_R
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sin_helper, sin, (T), (T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cos_helper, cos, (T), (T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acos_helper, acos, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tan_helper, tan, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asin_helper, asin, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atan_helper, atan, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sinh_helper, sinh, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cosh_helper, cosh, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tanh_helper, tanh, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asinh_helper, asinh, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acosh_helper, acosh, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atanh_helper, atanh, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tan_helper, tan, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asin_helper, asin, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atan_helper, atan, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sinh_helper, sinh, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cosh_helper, cosh, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tanh_helper, tanh, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asinh_helper, asinh, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acosh_helper, acosh, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atanh_helper, atanh, (T), (T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atan2_helper, atan2, (T), (T)(T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(abs_helper, sAbs, (T), (T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(abs_helper, fAbs, (T), (T), T)
@@ -189,23 +189,169 @@ struct erf_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScala
 {
 	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) _x)
 	{
-		const FloatingPoint a1 = FloatingPoint(NBL_FP64_LITERAL(0.254829592));
-		const FloatingPoint a2 = FloatingPoint(NBL_FP64_LITERAL(-0.284496736));
-		const FloatingPoint a3 = FloatingPoint(NBL_FP64_LITERAL(1.421413741));
-		const FloatingPoint a4 = FloatingPoint(NBL_FP64_LITERAL(-1.453152027));
-		const FloatingPoint a5 = FloatingPoint(NBL_FP64_LITERAL(1.061405429));
-		const FloatingPoint p = FloatingPoint(NBL_FP64_LITERAL(0.3275911));
+		// glibc implementation
+		const float64_t tiny = NBL_FP64_LITERAL(1e-300),
+			one = NBL_FP64_LITERAL(1.00000000000000000000e+00), /* 0x3FF00000, 0x00000000 */
+			erx = NBL_FP64_LITERAL(8.45062911510467529297e-01); /* 0x3FEB0AC1, 0x60000000 */
+
+		// Coefficients for approximation to erf in [0,0.84375]
+		const float64_t efx = NBL_FP64_LITERAL(1.28379167095512586316e-01); /* 0x3FC06EBA, 0x8214DB69 */
+		const float64_t pp0 = NBL_FP64_LITERAL(1.28379167095512558561e-01); /* 0x3FC06EBA, 0x8214DB68 */
+		const float64_t pp1 = NBL_FP64_LITERAL(-3.25042107247001499370e-01); /* 0xBFD4CD7D, 0x691CB913 */
+		const float64_t pp2 = NBL_FP64_LITERAL(-2.84817495755985104766e-02); /* 0xBF9D2A51, 0xDBD7194F */
+		const float64_t pp3 = NBL_FP64_LITERAL(-5.77027029648944159157e-03); /* 0xBF77A291, 0x236668E4 */
+		const float64_t pp4 = NBL_FP64_LITERAL(-2.37630166566501626084e-05); /* 0xBEF8EAD6, 0x120016AC */
+		const float64_t qq1 = NBL_FP64_LITERAL(3.97917223959155352819e-01); /* 0x3FD97779, 0xCDDADC09 */
+		const float64_t qq2 = NBL_FP64_LITERAL(6.50222499887672944485e-02); /* 0x3FB0A54C, 0x5536CEBA */
+		const float64_t qq3 = NBL_FP64_LITERAL(5.08130628187576562776e-03); /* 0x3F74D022, 0xC4D36B0F */
+		const float64_t qq4 = NBL_FP64_LITERAL(1.32494738004321644526e-04); /* 0x3F215DC9, 0x221C1A10 */
+		const float64_t qq5 = NBL_FP64_LITERAL(-3.96022827877536812320e-06); /* 0xBED09C43, 0x42A26120 */
+
+		//Coefficients for approximation to erf in [0.84375,1.25]
+		const float64_t pa0 = NBL_FP64_LITERAL(-2.36211856075265944077e-03); /* 0xBF6359B8, 0xBEF77538 */
+		const float64_t pa1 = NBL_FP64_LITERAL(4.14856118683748331666e-01); /* 0x3FDA8D00, 0xAD92B34D */
+		const float64_t pa2 = NBL_FP64_LITERAL(-3.72207876035701323847e-01); /* 0xBFD7D240, 0xFBB8C3F1 */
+		const float64_t pa3 = NBL_FP64_LITERAL(3.18346619901161753674e-01); /* 0x3FD45FCA, 0x805120E4 */
+		const float64_t pa4 = NBL_FP64_LITERAL(-1.10894694282396677476e-01); /* 0xBFBC6398, 0x3D3E28EC */
+		const float64_t pa5 = NBL_FP64_LITERAL(3.54783043256182359371e-02); /* 0x3FA22A36, 0x599795EB */
+		const float64_t pa6 = NBL_FP64_LITERAL(-2.16637559486879084300e-03); /* 0xBF61BF38, 0x0A96073F */
+		const float64_t qa1 = NBL_FP64_LITERAL(1.06420880400844228286e-01); /* 0x3FBB3E66, 0x18EEE323 */
+		const float64_t qa2 = NBL_FP64_LITERAL(5.40397917702171048937e-01); /* 0x3FE14AF0, 0x92EB6F33 */
+		const float64_t qa3 = NBL_FP64_LITERAL(7.18286544141962662868e-02); /* 0x3FB2635C, 0xD99FE9A7 */
+		const float64_t qa4 = NBL_FP64_LITERAL(1.26171219808761642112e-01); /* 0x3FC02660, 0xE763351F */
+		const float64_t qa5 = NBL_FP64_LITERAL(1.36370839120290507362e-02); /* 0x3F8BEDC2, 0x6B51DD1C */
+		const float64_t qa6 = NBL_FP64_LITERAL(1.19844998467991074170e-02); /* 0x3F888B54, 0x5735151D */
+
+		// Coefficients for approximation to erfc in [1.25,1/0.35]
+		const float64_t ra0 = NBL_FP64_LITERAL(-9.86494403484714822705e-03); /* 0xBF843412, 0x600D6435 */
+		const float64_t ra1 = NBL_FP64_LITERAL(-6.93858572707181764372e-01); /* 0xBFE63416, 0xE4BA7360 */
+		const float64_t ra2 = NBL_FP64_LITERAL(-1.05586262253232909814e+01); /* 0xC0251E04, 0x41B0E726 */
+		const float64_t ra3 = NBL_FP64_LITERAL(-6.23753324503260060396e+01); /* 0xC04F300A, 0xE4CBA38D */
+		const float64_t ra4 = NBL_FP64_LITERAL(-1.62396669462573470355e+02); /* 0xC0644CB1, 0x84282266 */
+		const float64_t ra5 = NBL_FP64_LITERAL(-1.84605092906711035994e+02); /* 0xC067135C, 0xEBCCABB2 */
+		const float64_t ra6 = NBL_FP64_LITERAL(-8.12874355063065934246e+01); /* 0xC0545265, 0x57E4D2F2 */
+		const float64_t ra7 = NBL_FP64_LITERAL(-9.81432934416914548592e+00); /* 0xC023A0EF, 0xC69AC25C */
+		const float64_t sa1 = NBL_FP64_LITERAL(1.96512716674392571292e+01); /* 0x4033A6B9, 0xBD707687 */
+		const float64_t sa2 = NBL_FP64_LITERAL(1.37657754143519042600e+02); /* 0x4061350C, 0x526AE721 */
+		const float64_t sa3 = NBL_FP64_LITERAL(4.34565877475229228821e+02); /* 0x407B290D, 0xD58A1A71 */
+		const float64_t sa4 = NBL_FP64_LITERAL(6.45387271733267880336e+02); /* 0x40842B19, 0x21EC2868 */
+		const float64_t sa5 = NBL_FP64_LITERAL(4.29008140027567833386e+02); /* 0x407AD021, 0x57700314 */
+		const float64_t sa6 = NBL_FP64_LITERAL(1.08635005541779435134e+02); /* 0x405B28A3, 0xEE48AE2C */
+		const float64_t sa7 = NBL_FP64_LITERAL(6.57024977031928170135e+00); /* 0x401A47EF, 0x8E484A93 */
+		const float64_t sa8 = NBL_FP64_LITERAL(-6.04244152148580987438e-02); /* 0xBFAEEFF2, 0xEE749A62 */
+
+		// Coefficients for approximation to erfc in [1/.35,28]
+		const float64_t rb0 = NBL_FP64_LITERAL(-9.86494292470009928597e-03); /* 0xBF843412, 0x39E86F4A */
+		const float64_t rb1 = NBL_FP64_LITERAL(-7.99283237680523006574e-01); /* 0xBFE993BA, 0x70C285DE */
+		const float64_t rb2 = NBL_FP64_LITERAL(-1.77579549177547519889e+01); /* 0xC031C209, 0x555F995A */
+		const float64_t rb3 = NBL_FP64_LITERAL(-1.60636384855821916062e+02); /* 0xC064145D, 0x43C5ED98 */
+		const float64_t rb4 = NBL_FP64_LITERAL(-6.37566443368389627722e+02); /* 0xC083EC88, 0x1375F228 */
+		const float64_t rb5 = NBL_FP64_LITERAL(-1.02509513161107724954e+03); /* 0xC0900461, 0x6A2E5992 */
+		const float64_t rb6 = NBL_FP64_LITERAL(-4.83519191608651397019e+02); /* 0xC07E384E, 0x9BDC383F */
+		const float64_t sb1 = NBL_FP64_LITERAL(3.03380607434824582924e+01); /* 0x403E568B, 0x261D5190 */
+		const float64_t sb2 = NBL_FP64_LITERAL(3.25792512996573918826e+02); /* 0x40745CAE, 0x221B9F0A */
+		const float64_t sb3 = NBL_FP64_LITERAL(1.53672958608443695994e+03); /* 0x409802EB, 0x189D5118 */
+		const float64_t sb4 = NBL_FP64_LITERAL(3.19985821950859553908e+03); /* 0x40A8FFB7, 0x688C246A */
+		const float64_t sb5 = NBL_FP64_LITERAL(2.55305040643316442583e+03); /* 0x40A3F219, 0xCEDF3BE6 */
+		const float64_t sb6 = NBL_FP64_LITERAL(4.74528541206955367215e+02); /* 0x407DA874, 0xE79FE763 */
+		const float64_t sb7 = NBL_FP64_LITERAL(-2.24409524465858183362e+01); /* 0xC03670E2, 0x42712D62 */
+
+		float64_t x = float64_t(_x);
+		int32_t hx, ix;
+		float64_t s, y, z, r;
+		hx = int32_t(bit_cast<uint64_t, float64_t>(x) >> 32);
+		ix = hx & 0x7fffffff;
+		if (ix >= 0x7ff00000)           // erf(nan)=nan, erf(+-inf)=+-1
+		{
+			int32_t i = ((uint32_t)hx >> 31) << 1;
+			return FloatingPoint((float64_t)(1.0 - i) + one / x); // converted to FloatingPoint like every other return path
+		}
+
+		float64_t P, Q;
+		if (ix < 0x3feb0000)            // |x| < 0.84375
+		{
+			if (ix < 0x3e300000)        // |x| < 2**-28
+			{
+				if (ix < 0x00800000)
+				{
+					// avoid underflow
+					return FloatingPoint(0.0625 * (16.0 * x + (16.0 * efx) * x));
+				}
+				return FloatingPoint(x + efx * x);
+			}
+			z = x * x;
+			r = pp0 + z * (pp1 + z * (pp2 + z * (pp3 + z * pp4)));
+			s = one + z * (qq1 + z * (qq2 + z * (qq3 + z * (qq4 + z * qq5))));
+			y = r / s;
+			return FloatingPoint(x + x * y);
+		}
+		if (ix < 0x3ff40000)            // 0.84375 <= |x| < 1.25
+		{
+			s = abs_helper<float64_t>::__call(x) - one;
+			P = pa0 + s * (pa1 + s * (pa2 + s * (pa3 + s * (pa4 + s * (pa5 + s * pa6)))));
+			Q = one + s * (qa1 + s * (qa2 + s * (qa3 + s * (qa4 + s * (qa5 + s * qa6))))); // degree 6 as in glibc: qa1..qa6, each used exactly once
+			if (hx >= 0)
+				return FloatingPoint(erx + P / Q);
+			else
+				return FloatingPoint(-erx - P / Q);
+		}
+		if (ix >= 0x40180000)           // inf > |x| >= 6
+		{
+			if (hx >= 0)
+				return FloatingPoint(one - tiny);
+			else
+				return FloatingPoint(tiny - one);
+		}
+
+		x = abs_helper<float64_t>::__call(x);
+		s = one / (x * x);
+		float64_t R, S;
+		if (ix < 0x4006DB6E)            // |x| < 1/0.35     ~2.85714
+		{
+			R = ra0 + s * (ra1 + s * (ra2 + s * (ra3 + s * (ra4 + s * (ra5 + s * (ra6 + s * ra7))))));
+			S = one + s * (sa1 + s * (sa2 + s * (sa3 + s * (sa4 + s * (sa5 + s * (sa6 + s * (sa7 + s * sa8))))))); // glibc's denominator runs through sa8
+		}
+		else                            // |x| >= 1/0.35
+		{
+			R = rb0 + s * (rb1 + s * (rb2 + s * (rb3 + s * (rb4 + s * (rb5 + s * rb6))))); // glibc's numerator runs through rb6
+			S = one + s * (sb1 + s * (sb2 + s * (sb3 + s * (sb4 + s * (sb5 + s * (sb6 + s * sb7))))));
+		}
+		z = x;
+		uint64_t z1 = bit_cast<uint64_t, float64_t>(x);
+		z1 &= 0xffffffff00000000;
+		z = bit_cast<float64_t, uint64_t>(z1);
+		r = exp_helper<float64_t>::__call(-z * z - 0.5625) * exp_helper<float64_t>::__call((z - x) * (z + x) + R / S);
+		if (hx >= 0)
+			return FloatingPoint(one - r / x);
+		else
+			return FloatingPoint(r / x - one);
+	}
+};
+
+template<>
+struct erf_helper<float32_t>
+{
+	static float32_t __call(NBL_CONST_REF_ARG(float32_t) _x)
+	{
+		// Abramowitz & Stegun formula 7.1.26: erf(|x|) ~= 1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2), t = 1/(1 + p*|x|); max abs error 1.5x10-7
+		const float32_t a1 = float32_t(NBL_FP64_LITERAL(0.254829592));
+		const float32_t a2 = float32_t(NBL_FP64_LITERAL(-0.284496736));
+		const float32_t a3 = float32_t(NBL_FP64_LITERAL(1.421413741));
+		const float32_t a4 = float32_t(NBL_FP64_LITERAL(-1.453152027));
+		const float32_t a5 = float32_t(NBL_FP64_LITERAL(1.061405429));
+		const float32_t p = float32_t(NBL_FP64_LITERAL(0.3275911));
 
-		FloatingPoint _sign = FloatingPoint(sign(_x));
-		FloatingPoint x = abs(_x);
+		float32_t _sign = float32_t(sign(_x)); // the polynomial is evaluated on |x|; the sign is reapplied on return
+		float32_t x = abs(_x);
 
-		FloatingPoint t = FloatingPoint(NBL_FP64_LITERAL(1.0)) / (FloatingPoint(NBL_FP64_LITERAL(1.0)) + p * x);
-		FloatingPoint y = FloatingPoint(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x);
+		float32_t t = float32_t(NBL_FP64_LITERAL(1.0)) / (float32_t(NBL_FP64_LITERAL(1.0)) + p * x);
+		float32_t y = float32_t(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x); // Horner form of the degree-5 polynomial in t
 
 		return _sign * y;
 	}
 };
 
+
 #else // C++ only specializations
 
 #define DECL_ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) const _T arg##i
@@ -226,16 +372,16 @@ struct HELPER_NAME<BOOST_PP_SEQ_FOR_EACH_I(WRAP, _, ARG_TYPE_LIST)>\
 
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cos_helper, cos, concepts::FloatingPointScalar<T>, (T), (T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sin_helper, sin, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tan_helper, tan, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asin_helper, asin, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acos_helper, acos, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atan_helper, atan, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sinh_helper, sinh, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cosh_helper, cosh, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tanh_helper, tanh, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asinh_helper, asinh, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acosh_helper, acosh, concepts::FloatingPointScalar<T>, (T), (T), T)
-template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atanh_helper, atanh, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tan_helper, tan, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asin_helper, asin, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acos_helper, acos, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atan_helper, atan, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sinh_helper, sinh, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cosh_helper, cosh, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(tanh_helper, tanh, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(asinh_helper, asinh, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acosh_helper, acosh, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atanh_helper, atanh, concepts::FloatingPointScalar<T>, (T), (T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(atan2_helper, atan2, concepts::FloatingPointScalar<T>, (T), (T)(T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sqrt_helper, sqrt, concepts::FloatingPointScalar<T>, (T), (T), T)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(abs_helper, abs, concepts::Scalar<T>, (T), (T), T)
@@ -283,11 +429,11 @@ requires concepts::FloatingPointScalar<T>
 struct isinf_helper<T>
 {
 	using return_t = bool;
-	static inline return_t __call(const T arg)
+	static inline return_t __call(const T arg)
 	{
-		// GCC and Clang will always return false with call to std::isinf when fast math is enabled,
-		// this implementation will always return appropriate output regardless is fast math is enabled or not
-		using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type;
+		// GCC and Clang will always return false with call to std::isinf when fast math is enabled,
+		// this implementation will always return appropriate output regardless is fast math is enabled or not
+		using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type;
 		return cpp_compat_intrinsics_impl::isinf_uint_impl(reinterpret_cast<const AsUint&>(arg));
 	}
 };
@@ -297,7 +443,7 @@ requires concepts::FloatingPointScalar<T>
 struct isnan_helper<T>
 {
 	using return_t = bool;
-	static inline return_t __call(const T arg)
+	static inline return_t __call(const T arg)
 	{
 		// GCC and Clang will always return false with call to std::isnan when fast math is enabled,
 		// this implementation will always return appropriate output regardless is fast math is enabled or not
@@ -324,13 +470,13 @@ struct roundEven_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPoin
 	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) x)
 	{
 		// TODO: no way this is optimal, find a better implementation
-		float tmp;
-		if (std::abs(std::modf(x, &tmp)) == 0.5f)
-		{
-			int32_t result = static_cast<int32_t>(x);
-			if (result % 2 != 0)
-				result >= 0 ? ++result : --result;
-			return result;
+		float tmp;
+		if (std::abs(std::modf(x, &tmp)) == 0.5f)
+		{
+			int32_t result = static_cast<int32_t>(x);
+			if (result % 2 != 0)
+				result >= 0 ? ++result : --result;
+			return result;
 		}
 
 		return std::round(x);
@@ -389,6 +535,27 @@ struct frexpStruct_helper<T>
 
 // C++ and HLSL specializations
 
+template<>
+struct erf_helper<float16_t>
+{
+	static float16_t __call(float16_t _x)
+	{
+		// Abramowitz & Stegun formula 7.1.25: erf(|x|) ~= 1 - (a1*t + a2*t^2 + a3*t^3) * exp(-x^2), t = 1/(1 + p*|x|); max abs error 2.5x10-5
+		const float16_t a1 = float16_t(0.3480242f);
+		const float16_t a2 = float16_t(-0.0958798f);
+		const float16_t a3 = float16_t(0.7478556f);
+		const float16_t p = float16_t(0.47047f);
+
+		float16_t _sign = float16_t(sign<float16_t>(_x)); // the polynomial is evaluated on |x|; the sign is reapplied on return
+		float16_t x = abs_helper<float16_t>::__call(_x);
+
+		float16_t t = float16_t(1.f) / (float16_t(1.f) + p * x);
+		float16_t y = float16_t(1.f) - (((a3 * t + a2) * t) + a1) * t * exp(-x * x); // Horner form of the cubic in t
+
+		return _sign * y;
+	}
+};
+
 template<typename FloatingPoint>
 NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<FloatingPoint>)
 struct erfInv_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<FloatingPoint>) >
@@ -471,14 +638,14 @@ AUTO_SPECIALIZE_HELPER_FOR_VECTOR(isnan_helper, VECTOR_SPECIALIZATION_CONCEPT, I
 AUTO_SPECIALIZE_HELPER_FOR_VECTOR(cos_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
 AUTO_SPECIALIZE_HELPER_FOR_VECTOR(sin_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
 AUTO_SPECIALIZE_HELPER_FOR_VECTOR(acos_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(tan_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(asin_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(atan_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(sinh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(cosh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(tanh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(asinh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
-AUTO_SPECIALIZE_HELPER_FOR_VECTOR(acosh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(tan_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(asin_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(atan_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(sinh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(cosh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(tanh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(asinh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(acosh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
 AUTO_SPECIALIZE_HELPER_FOR_VECTOR(atanh_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
 AUTO_SPECIALIZE_HELPER_FOR_VECTOR(modf_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
 AUTO_SPECIALIZE_HELPER_FOR_VECTOR(round_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
@@ -636,4 +803,4 @@ struct atan2_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
 }
 }
 
-#endif
\ No newline at end of file
+#endif

From d1ff526e771c843cf0938293a8d6e411c4e910e0 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 4 Mar 2025 15:40:38 +0700
Subject: [PATCH 07/11] erfinv for fp64

---
 include/nbl/builtin/hlsl/tgmath/impl.hlsl | 91 ++++++++++++++++++++++-
 1 file changed, 88 insertions(+), 3 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
index faa939459e..f46aeabdc1 100644
--- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl
+++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
@@ -146,7 +146,7 @@ template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(frexpStruct_helper, fre
 #define ISINF_AND_ISNAN_RETURN_TYPE conditional_t<is_vector_v<T>, vector<bool, vector_traits<T>::Dimension>, bool>
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(isinf_helper, isInf, (T), (T), ISINF_AND_ISNAN_RETURN_TYPE)
 template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(isnan_helper, isNan, (T), (T), ISINF_AND_ISNAN_RETURN_TYPE)
-#undef ISINF_AND_ISNAN_RETURN_TYPE 
+#undef ISINF_AND_ISNAN_RETURN_TYPE
 
 #undef DECLVAL
 #undef DECL_ARG
@@ -596,6 +596,91 @@ struct erfInv_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointSc
 	}
 };
 
+template<>
+struct erfInv_helper<float64_t>
+{
+	static float64_t __call(NBL_CONST_REF_ARG(float64_t) _x)
+	{ // fp64 inverse error function: piecewise polynomial in w = -log(1 - x^2) (coefficients match M. Giles' double-precision erfinv fit), returns p(w) * x
+		float64_t x = clamp<float64_t>(_x, NBL_FP64_LITERAL(-0.99999999999999989), NBL_FP64_LITERAL(0.99999999999999989)); // +-(1 - 2^-53): log below stays finite, tail segment stays reachable
+		// w picks the polynomial segment; with the clamp above it tops out near 36
+		float64_t w = -log_helper<float64_t>::__call((NBL_FP64_LITERAL(1.0) - x) * (NBL_FP64_LITERAL(1.0) + x));
+		float64_t p;
+		if (w < NBL_FP64_LITERAL(6.250000)) // central segment
+		{
+			w -= NBL_FP64_LITERAL(3.125000);
+			p = NBL_FP64_LITERAL(-3.6444120640178196996e-21);
+			p = NBL_FP64_LITERAL(-1.685059138182016589e-19) + p * w;
+			p = NBL_FP64_LITERAL(1.2858480715256400167e-18) + p * w;
+			p = NBL_FP64_LITERAL(1.115787767802518096e-17) + p * w;
+			p = NBL_FP64_LITERAL(-1.333171662854620906e-16) + p * w;
+			p = NBL_FP64_LITERAL(2.0972767875968561637e-17) + p * w;
+			p = NBL_FP64_LITERAL(6.6376381343583238325e-15) + p * w;
+			p = NBL_FP64_LITERAL(-4.0545662729752068639e-14) + p * w;
+			p = NBL_FP64_LITERAL(-8.1519341976054721522e-14) + p * w;
+			p = NBL_FP64_LITERAL(2.6335093153082322977e-12) + p * w;
+			p = NBL_FP64_LITERAL(-1.2975133253453532498e-11) + p * w;
+			p = NBL_FP64_LITERAL(-5.4154120542946279317e-11) + p * w;
+			p = NBL_FP64_LITERAL(1.051212273321532285e-09) + p * w;
+			p = NBL_FP64_LITERAL(-4.1126339803469836976e-09) + p * w;
+			p = NBL_FP64_LITERAL(-2.9070369957882005086e-08) + p * w;
+			p = NBL_FP64_LITERAL(4.2347877827932403518e-07) + p * w;
+			p = NBL_FP64_LITERAL(-1.3654692000834678645e-06) + p * w;
+			p = NBL_FP64_LITERAL(-1.3882523362786468719e-05) + p * w;
+			p = NBL_FP64_LITERAL(0.0001867342080340571352) + p * w;
+			p = NBL_FP64_LITERAL(-0.00074070253416626697512) + p * w;
+			p = NBL_FP64_LITERAL(-0.0060336708714301490533) + p * w;
+			p = NBL_FP64_LITERAL(0.24015818242558961693) + p * w;
+			p = NBL_FP64_LITERAL(1.6536545626831027356) + p * w;
+		}
+		else if (w < NBL_FP64_LITERAL(16.000000)) // intermediate segment
+		{
+			w = sqrt_helper<float64_t>::__call(w) - NBL_FP64_LITERAL(3.250000);
+			p = NBL_FP64_LITERAL(2.2137376921775787049e-09);
+			p = NBL_FP64_LITERAL(9.0756561938885390979e-08) + p * w;
+			p = NBL_FP64_LITERAL(-2.7517406297064545428e-07) + p * w;
+			p = NBL_FP64_LITERAL(1.8239629214389227755e-08) + p * w;
+			p = NBL_FP64_LITERAL(1.5027403968909827627e-06) + p * w;
+			p = NBL_FP64_LITERAL(-4.013867526981545969e-06) + p * w;
+			p = NBL_FP64_LITERAL(2.9234449089955446044e-06) + p * w;
+			p = NBL_FP64_LITERAL(1.2475304481671778723e-05) + p * w;
+			p = NBL_FP64_LITERAL(-4.7318229009055733981e-05) + p * w;
+			p = NBL_FP64_LITERAL(6.8284851459573175448e-05) + p * w;
+			p = NBL_FP64_LITERAL(2.4031110387097893999e-05) + p * w;
+			p = NBL_FP64_LITERAL(-0.0003550375203628474796) + p * w;
+			p = NBL_FP64_LITERAL(0.00095328937973738049703) + p * w;
+			p = NBL_FP64_LITERAL(-0.0016882755560235047313) + p * w;
+			p = NBL_FP64_LITERAL(0.0024914420961078508066) + p * w;
+			p = NBL_FP64_LITERAL(-0.0037512085075692412107) + p * w;
+			p = NBL_FP64_LITERAL(0.005370914553590063617) + p * w;
+			p = NBL_FP64_LITERAL(1.0052589676941592334) + p * w;
+			p = NBL_FP64_LITERAL(3.0838856104922207635) + p * w;
+		}
+		else // tail segment, w >= 16
+		{
+			w = sqrt_helper<float64_t>::__call(w) - NBL_FP64_LITERAL(5.000000);
+			p = NBL_FP64_LITERAL(-2.7109920616438573243e-11);
+			p = NBL_FP64_LITERAL(-2.5556418169965252055e-10) + p * w;
+			p = NBL_FP64_LITERAL(1.5076572693500548083e-09) + p * w;
+			p = NBL_FP64_LITERAL(-3.7894654401267369937e-09) + p * w;
+			p = NBL_FP64_LITERAL(7.6157012080783393804e-09) + p * w;
+			p = NBL_FP64_LITERAL(-1.4960026627149240478e-08) + p * w;
+			p = NBL_FP64_LITERAL(2.9147953450901080826e-08) + p * w;
+			p = NBL_FP64_LITERAL(-6.7711997758452339498e-08) + p * w;
+			p = NBL_FP64_LITERAL(2.2900482228026654717e-07) + p * w;
+			p = NBL_FP64_LITERAL(-9.9298272942317002539e-07) + p * w;
+			p = NBL_FP64_LITERAL(4.5260625972231537039e-06) + p * w;
+			p = NBL_FP64_LITERAL(-1.9681778105531670567e-05) + p * w;
+			p = NBL_FP64_LITERAL(7.5995277030017761139e-05) + p * w;
+			p = NBL_FP64_LITERAL(-0.00021503011930044477347) + p * w;
+			p = NBL_FP64_LITERAL(-0.00013871931833623122026) + p * w;
+			p = NBL_FP64_LITERAL(1.0103004648645343977) + p * w;
+			p = NBL_FP64_LITERAL(4.8499064014085844221) + p * w;
+		}
+		// p is the Horner-evaluated polynomial for the selected segment; the result is p scaled by the clamped input
+		return p * x;
+	}
+};
+
 #ifdef __HLSL_VERSION
 // SPIR-V already defines specializations for builtin vector types
 #define VECTOR_SPECIALIZATION_CONCEPT concepts::Vectorial<T> && !is_vector_v<T>
@@ -668,11 +753,11 @@ struct pow_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
 		using traits = hlsl::vector_traits<T>;
 		array_get<T, typename traits::scalar_type> getter;
 		array_set<T, typename traits::scalar_type> setter;
-		
+
 		return_t output;
 		for (uint32_t i = 0; i < traits::Dimension; ++i)
 			setter(output, i, pow_helper<typename traits::scalar_type>::__call(getter(x, i), getter(y, i)));
-	
+
 		return output;
 	}
 };

From 4c963587b5cc8700224a02a15e41301bc49654ff Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 4 Mar 2025 15:53:40 +0700
Subject: [PATCH 08/11] reverted erf_helper

---
 include/nbl/builtin/hlsl/concepts/core.hlsl |   6 +-
 include/nbl/builtin/hlsl/tgmath/impl.hlsl   | 169 ++------------------
 2 files changed, 15 insertions(+), 160 deletions(-)

diff --git a/include/nbl/builtin/hlsl/concepts/core.hlsl b/include/nbl/builtin/hlsl/concepts/core.hlsl
index dcbafae8a5..a732783f2c 100644
--- a/include/nbl/builtin/hlsl/concepts/core.hlsl
+++ b/include/nbl/builtin/hlsl/concepts/core.hlsl
@@ -29,13 +29,13 @@ template<typename T>
 NBL_BOOL_CONCEPT UnsignedIntegral = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T>;
 
 template<typename T>
-NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T>;
+NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
 
 template<typename T>
 NBL_BOOL_CONCEPT Boolean = nbl::hlsl::is_same_v<T, bool> || (nbl::hlsl::is_vector_v<T> && nbl::hlsl::is_same_v<typename vector_traits<T>::scalar_type, bool>);
 
 template <typename T>
-NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T>;
+NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
 
 template<typename T>
 NBL_BOOL_CONCEPT IntegralScalar = nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
@@ -47,7 +47,7 @@ template<typename T>
 NBL_BOOL_CONCEPT UnsignedIntegralScalar = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
 
 template<typename T>
-NBL_BOOL_CONCEPT FloatingPointScalar = nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>;
+NBL_BOOL_CONCEPT FloatingPointScalar = (nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>) || nbl::hlsl::is_same_v<T, float16_t>;
 
 template<typename T>
 NBL_BOOL_CONCEPT BooleanScalar = concepts::Boolean<T> && nbl::hlsl::is_scalar_v<T>;
diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
index f46aeabdc1..46f18c85db 100644
--- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl
+++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
@@ -189,163 +189,18 @@ struct erf_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScala
 {
 	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) _x)
 	{
-		// glibc implementation
-		const float64_t tiny = NBL_FP64_LITERAL(1e-300),
-			one = NBL_FP64_LITERAL(1.00000000000000000000e+00), /* 0x3FF00000, 0x00000000 */
-			erx = NBL_FP64_LITERAL(8.45062911510467529297e-01); /* 0x3FEB0AC1, 0x60000000 */
-
-		// Coefficients for approximation to erf in [0,0.84375]
-		const float64_t efx = NBL_FP64_LITERAL(1.28379167095512586316e-01); /* 0x3FC06EBA, 0x8214DB69 */
-		const float64_t pp0 = NBL_FP64_LITERAL(1.28379167095512558561e-01); /* 0x3FC06EBA, 0x8214DB68 */
-		const float64_t pp1 = NBL_FP64_LITERAL(-3.25042107247001499370e-01); /* 0xBFD4CD7D, 0x691CB913 */
-		const float64_t pp2 = NBL_FP64_LITERAL(-2.84817495755985104766e-02); /* 0xBF9D2A51, 0xDBD7194F */
-		const float64_t pp3 = NBL_FP64_LITERAL(-5.77027029648944159157e-03); /* 0xBF77A291, 0x236668E4 */
-		const float64_t pp4 = NBL_FP64_LITERAL(-2.37630166566501626084e-05); /* 0xBEF8EAD6, 0x120016AC */
-		const float64_t qq1 = NBL_FP64_LITERAL(3.97917223959155352819e-01); /* 0x3FD97779, 0xCDDADC09 */
-		const float64_t qq2 = NBL_FP64_LITERAL(6.50222499887672944485e-02); /* 0x3FB0A54C, 0x5536CEBA */
-		const float64_t qq3 = NBL_FP64_LITERAL(5.08130628187576562776e-03); /* 0x3F74D022, 0xC4D36B0F */
-		const float64_t qq4 = NBL_FP64_LITERAL(1.32494738004321644526e-04); /* 0x3F215DC9, 0x221C1A10 */
-		const float64_t qq5 = NBL_FP64_LITERAL(-3.96022827877536812320e-06); /* 0xBED09C43, 0x42A26120 */
-
-		//Coefficients for approximation to erf in [0.84375,1.25]
-		const float64_t pa0 = NBL_FP64_LITERAL(-2.36211856075265944077e-03); /* 0xBF6359B8, 0xBEF77538 */
-		const float64_t pa1 = NBL_FP64_LITERAL(4.14856118683748331666e-01); /* 0x3FDA8D00, 0xAD92B34D */
-		const float64_t pa2 = NBL_FP64_LITERAL(-3.72207876035701323847e-01); /* 0xBFD7D240, 0xFBB8C3F1 */
-		const float64_t pa3 = NBL_FP64_LITERAL(3.18346619901161753674e-01); /* 0x3FD45FCA, 0x805120E4 */
-		const float64_t pa4 = NBL_FP64_LITERAL(-1.10894694282396677476e-01); /* 0xBFBC6398, 0x3D3E28EC */
-		const float64_t pa5 = NBL_FP64_LITERAL(3.54783043256182359371e-02); /* 0x3FA22A36, 0x599795EB */
-		const float64_t pa6 = NBL_FP64_LITERAL(-2.16637559486879084300e-03); /* 0xBF61BF38, 0x0A96073F */
-		const float64_t qa1 = NBL_FP64_LITERAL(1.06420880400844228286e-01); /* 0x3FBB3E66, 0x18EEE323 */
-		const float64_t qa2 = NBL_FP64_LITERAL(5.40397917702171048937e-01); /* 0x3FE14AF0, 0x92EB6F33 */
-		const float64_t qa3 = NBL_FP64_LITERAL(7.18286544141962662868e-02); /* 0x3FB2635C, 0xD99FE9A7 */
-		const float64_t qa4 = NBL_FP64_LITERAL(1.26171219808761642112e-01); /* 0x3FC02660, 0xE763351F */
-		const float64_t qa5 = NBL_FP64_LITERAL(1.36370839120290507362e-02); /* 0x3F8BEDC2, 0x6B51DD1C */
-		const float64_t qa6 = NBL_FP64_LITERAL(1.19844998467991074170e-02); /* 0x3F888B54, 0x5735151D */
-
-		// Coefficients for approximation to erfc in [1.25,1/0.35]
-		const float64_t ra0 = NBL_FP64_LITERAL(-9.86494403484714822705e-03); /* 0xBF843412, 0x600D6435 */
-		const float64_t ra1 = NBL_FP64_LITERAL(-6.93858572707181764372e-01); /* 0xBFE63416, 0xE4BA7360 */
-		const float64_t ra2 = NBL_FP64_LITERAL(-1.05586262253232909814e+01); /* 0xC0251E04, 0x41B0E726 */
-		const float64_t ra3 = NBL_FP64_LITERAL(-6.23753324503260060396e+01); /* 0xC04F300A, 0xE4CBA38D */
-		const float64_t ra4 = NBL_FP64_LITERAL(-1.62396669462573470355e+02); /* 0xC0644CB1, 0x84282266 */
-		const float64_t ra5 = NBL_FP64_LITERAL(-1.84605092906711035994e+02); /* 0xC067135C, 0xEBCCABB2 */
-		const float64_t ra6 = NBL_FP64_LITERAL(-8.12874355063065934246e+01); /* 0xC0545265, 0x57E4D2F2 */
-		const float64_t ra7 = NBL_FP64_LITERAL(-9.81432934416914548592e+00); /* 0xC023A0EF, 0xC69AC25C */
-		const float64_t sa1 = NBL_FP64_LITERAL(1.96512716674392571292e+01); /* 0x4033A6B9, 0xBD707687 */
-		const float64_t sa2 = NBL_FP64_LITERAL(1.37657754143519042600e+02); /* 0x4061350C, 0x526AE721 */
-		const float64_t sa3 = NBL_FP64_LITERAL(4.34565877475229228821e+02); /* 0x407B290D, 0xD58A1A71 */
-		const float64_t sa4 = NBL_FP64_LITERAL(6.45387271733267880336e+02); /* 0x40842B19, 0x21EC2868 */
-		const float64_t sa5 = NBL_FP64_LITERAL(4.29008140027567833386e+02); /* 0x407AD021, 0x57700314 */
-		const float64_t sa6 = NBL_FP64_LITERAL(1.08635005541779435134e+02); /* 0x405B28A3, 0xEE48AE2C */
-		const float64_t sa7 = NBL_FP64_LITERAL(6.57024977031928170135e+00); /* 0x401A47EF, 0x8E484A93 */
-		const float64_t sa8 = NBL_FP64_LITERAL(-6.04244152148580987438e-02); /* 0xBFAEEFF2, 0xEE749A62 */
-
-		// Coefficients for approximation to erfc in [1/.35,28]
-		const float64_t rb0 = NBL_FP64_LITERAL(-9.86494292470009928597e-03); /* 0xBF843412, 0x39E86F4A */
-		const float64_t rb1 = NBL_FP64_LITERAL(-7.99283237680523006574e-01); /* 0xBFE993BA, 0x70C285DE */
-		const float64_t rb2 = NBL_FP64_LITERAL(-1.77579549177547519889e+01); /* 0xC031C209, 0x555F995A */
-		const float64_t rb3 = NBL_FP64_LITERAL(-1.60636384855821916062e+02); /* 0xC064145D, 0x43C5ED98 */
-		const float64_t rb4 = NBL_FP64_LITERAL(-6.37566443368389627722e+02); /* 0xC083EC88, 0x1375F228 */
-		const float64_t rb5 = NBL_FP64_LITERAL(-1.02509513161107724954e+03); /* 0xC0900461, 0x6A2E5992 */
-		const float64_t rb6 = NBL_FP64_LITERAL(-4.83519191608651397019e+02); /* 0xC07E384E, 0x9BDC383F */
-		const float64_t sb1 = NBL_FP64_LITERAL(3.03380607434824582924e+01); /* 0x403E568B, 0x261D5190 */
-		const float64_t sb2 = NBL_FP64_LITERAL(3.25792512996573918826e+02); /* 0x40745CAE, 0x221B9F0A */
-		const float64_t sb3 = NBL_FP64_LITERAL(1.53672958608443695994e+03); /* 0x409802EB, 0x189D5118 */
-		const float64_t sb4 = NBL_FP64_LITERAL(3.19985821950859553908e+03); /* 0x40A8FFB7, 0x688C246A */
-		const float64_t sb5 = NBL_FP64_LITERAL(2.55305040643316442583e+03); /* 0x40A3F219, 0xCEDF3BE6 */
-		const float64_t sb6 = NBL_FP64_LITERAL(4.74528541206955367215e+02); /* 0x407DA874, 0xE79FE763 */
-		const float64_t sb7 = NBL_FP64_LITERAL(-2.24409524465858183362e+01); /* 0xC03670E2, 0x42712D62 */
-
-		float64_t x = float64_t(_x);
-		int32_t hx, ix;
-		float64_t s, y, z, r;
-		hx = int32_t(bit_cast<uint64_t, float64_t>(x) >> 32);
-		ix = hx & 0x7fffffff;
-		if (ix >= 0x7ff00000)           // erf(nan)=nan, erf(+-inf)=+-1
-		{
-			int32_t i = ((uint32_t)hx >> 31) << 1;
-			return (float64_t)(1.0 - i) + one / x;
-		}
-
-		float64_t P, Q;
-		if (ix < 0x3feb0000)            // |x| < 0.84375
-		{
-			if (ix < 0x3e300000)        // |x| < 2**-28
-			{
-				if (ix < 0x00800000)
-				{
-					// avoid underflow
-					return FloatingPoint(0.0625 * (16.0 * x + (16.0 * efx) * x));
-				}
-				return FloatingPoint(x + efx * x);
-			}
-			z = x * x;
-			r = pp0 + z * (pp1 + z * (pp2 + z * (pp3 + z * pp4)));
-			s = one + z * (qq1 + z * (qq2 + z * (qq3 + z * (qq4 + z * qq5))));
-			y = r / s;
-			return FloatingPoint(x + x * y);
-		}
-		if (ix < 0x3ff40000)            // 0.84375 <= |x| < 1.25
-		{
-			s = abs_helper<float64_t>::__call(x) - one;
-			P = pa0 + s * (pa1 + s * (pa2 + s * (pa3 + s * (pa4 + s * (pa5 + s * pa6)))));
-			Q = one + s * (qa1 + s * (qa2 + s * (qa3 + s * (qa4 + s * (qa5 + s * (qa5 + s * qa6))))));
-			if (hx >= 0)
-				return FloatingPoint(erx + P / Q);
-			else
-				return FloatingPoint(-erx - P / Q);
-		}
-		if (ix >= 0x40180000)           // inf > |x| >= 6
-		{
-			if (hx >= 0)
-				return FloatingPoint(one - tiny);
-			else
-				return FloatingPoint(tiny - one);
-		}
-
-		x = abs_helper<float64_t>::__call(x);
-		s = one / (x * x);
-		float64_t R, S;
-		if (ix < 0x4006DB6E)            // |x| < 1/0.35     ~2.85714
-		{
-			R = ra0 + s * (ra1 + s * (ra2 + s * (ra3 + s * (ra4 + s * (ra5 + s * (ra6 + s * ra7))))));
-			S = one + s * (sa1 + s * (sa2 + s * (sa3 + s * (sa4 + s * (sa5 + s * (sa6 + s * sa7))))));
-		}
-		else                            // |x| >= 1/0.35
-		{
-			R = rb0 + s * (rb1 + s * (rb2 + s * (rb3 + s * (rb4 + s * rb5))));
-			S = one + s * (sb1 + s * (sb2 + s * (sb3 + s * (sb4 + s * (sb5 + s * (sb6 + s * sb7))))));
-		}
-		z = x;
-		uint64_t z1 = bit_cast<uint64_t, float64_t>(x);
-		z1 &= 0xffffffff00000000;
-		z = bit_cast<float64_t, uint64_t>(z1);
-		r = exp_helper<float64_t>::__call(-z * z - 0.5625) * exp_helper<float64_t>::__call((z - x) * (z + x) + R / S);
-		if (hx >= 0)
-			return FloatingPoint(one - r / x);
-		else
-			return FloatingPoint(r / x - one);
-	}
-};
-
-template<>
-struct erf_helper<float32_t>
-{
-	static float32_t __call(NBL_CONST_REF_ARG(float32_t) _x)
-	{
-		// A&S approximation to 1.5x10-7
-		const float32_t a1 = float32_t(NBL_FP64_LITERAL(0.254829592));
-		const float32_t a2 = float32_t(NBL_FP64_LITERAL(-0.284496736));
-		const float32_t a3 = float32_t(NBL_FP64_LITERAL(1.421413741));
-		const float32_t a4 = float32_t(NBL_FP64_LITERAL(-1.453152027));
-		const float32_t a5 = float32_t(NBL_FP64_LITERAL(1.061405429));
-		const float32_t p = float32_t(NBL_FP64_LITERAL(0.3275911));
-
-		float32_t _sign = float32_t(sign(_x));
-		float32_t x = abs(_x);
-
-		float32_t t = float32_t(NBL_FP64_LITERAL(1.0)) / (float32_t(NBL_FP64_LITERAL(1.0)) + p * x);
-		float32_t y = float32_t(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x);
+		const FloatingPoint a1 = FloatingPoint(NBL_FP64_LITERAL(0.254829592));
+		const FloatingPoint a2 = FloatingPoint(NBL_FP64_LITERAL(-0.284496736));
+		const FloatingPoint a3 = FloatingPoint(NBL_FP64_LITERAL(1.421413741));
+		const FloatingPoint a4 = FloatingPoint(NBL_FP64_LITERAL(-1.453152027));
+		const FloatingPoint a5 = FloatingPoint(NBL_FP64_LITERAL(1.061405429));
+		const FloatingPoint p = FloatingPoint(NBL_FP64_LITERAL(0.3275911));
+		// a1..a5, p: constants of the A&S erf approximation (~1.5e-7 error); TODO confirm that accuracy is enough for float64_t callers
+		FloatingPoint _sign = FloatingPoint(sign(_x));
+		FloatingPoint x = abs(_x);
+		// evaluated on |x|; the sign saved above is multiplied back in at the return
+		FloatingPoint t = FloatingPoint(NBL_FP64_LITERAL(1.0)) / (FloatingPoint(NBL_FP64_LITERAL(1.0)) + p * x);
+		FloatingPoint y = FloatingPoint(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x);
 
 		return _sign * y;
 	}

From 75b50faa02c0af386c9cc4f8ae299bbe37cd7867 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 4 Mar 2025 17:01:16 +0700
Subject: [PATCH 09/11] some minor fixes

---
 include/nbl/builtin/hlsl/ieee754.hlsl     | 2 +-
 include/nbl/builtin/hlsl/tgmath/impl.hlsl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl
index 8d9c78a9f0..4b281c2111 100644
--- a/include/nbl/builtin/hlsl/ieee754.hlsl
+++ b/include/nbl/builtin/hlsl/ieee754.hlsl
@@ -148,7 +148,7 @@ NBL_CONSTEXPR_INLINE_FUNC FloatingPoint flipSign(FloatingPoint val, bool flip =
 	using AsFloat = typename float_of_size<sizeof(FloatingPoint)>::type;
 	using AsUint = typename unsigned_integer_of_size<sizeof(FloatingPoint)>::type;
 	const AsUint asUint = ieee754::impl::bitCastToUintType(val);
-	return bit_cast<FloatingPoint>(asUint ^ (flip ? ieee754::traits<AsFloat>::signMask : 0ull));
+	return bit_cast<FloatingPoint>(asUint ^ (flip ? ieee754::traits<AsFloat>::signMask : AsUint(0ull)));
 }
 
 }
diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
index 46f18c85db..24bf6796ea 100644
--- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl
+++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
@@ -458,7 +458,7 @@ struct erfInv_helper<float64_t>
 	{
 		float64_t x = clamp<float64_t>(_x, NBL_FP64_LITERAL(-0.99999), NBL_FP64_LITERAL(0.99999));
 
-		float64_t w = -log_helper<float64_t>::__call((NBL_FP64_LITERAL(1.0) - x) * (NBL_FP64_LITERAL(1.0) + x));
+		float64_t w = float64_t(-log_helper<float32_t>::__call((float32_t(NBL_FP64_LITERAL(1.0)) - x) * float32_t(NBL_FP64_LITERAL(1.0)) + x));
 		float64_t p;
 		if (w < 6.250000)
 		{

From 37a8b7b5335fae41743943694b98ffee6158c1fd Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 5 Mar 2025 10:39:18 +0700
Subject: [PATCH 10/11] reverted some changes, erf/erfInv fixes

---
 include/nbl/builtin/hlsl/concepts/core.hlsl |   6 +-
 include/nbl/builtin/hlsl/tgmath/impl.hlsl   | 171 ++++++++++----------
 2 files changed, 89 insertions(+), 88 deletions(-)

diff --git a/include/nbl/builtin/hlsl/concepts/core.hlsl b/include/nbl/builtin/hlsl/concepts/core.hlsl
index a732783f2c..c1bc0277df 100644
--- a/include/nbl/builtin/hlsl/concepts/core.hlsl
+++ b/include/nbl/builtin/hlsl/concepts/core.hlsl
@@ -29,13 +29,13 @@ template<typename T>
 NBL_BOOL_CONCEPT UnsignedIntegral = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T>;
 
 template<typename T>
-NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
+NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T>;
 
 template<typename T>
 NBL_BOOL_CONCEPT Boolean = nbl::hlsl::is_same_v<T, bool> || (nbl::hlsl::is_vector_v<T> && nbl::hlsl::is_same_v<typename vector_traits<T>::scalar_type, bool>);
 
 template <typename T>
-NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
+NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T>;
 
 template<typename T>
 NBL_BOOL_CONCEPT IntegralScalar = nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
@@ -47,7 +47,7 @@ template<typename T>
 NBL_BOOL_CONCEPT UnsignedIntegralScalar = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
 
 template<typename T>
-NBL_BOOL_CONCEPT FloatingPointScalar = (nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>) || nbl::hlsl::is_same_v<T, float16_t>;
+NBL_BOOL_CONCEPT FloatingPointScalar = nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>;
 
 template<typename T>
 NBL_BOOL_CONCEPT BooleanScalar = concepts::Boolean<T> && nbl::hlsl::is_scalar_v<T>;
diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
index 24bf6796ea..c73ad500c4 100644
--- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl
+++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
@@ -405,7 +405,7 @@ struct erf_helper<float16_t>
 		float16_t x = abs_helper<float16_t>::__call(_x);
 
 		float16_t t = float16_t(1.f) / (float16_t(1.f) + p * x);
-		float16_t y = float16_t(1.f) - (((a3 * t + a2) * t) + a1) * t * exp(-x * x);
+		float16_t y = float16_t(1.f) - (((a3 * t + a2) * t) + a1) * t * exp_helper<float16_t>::__call(-x * x);
 
 		return _sign * y;
 	}
@@ -451,90 +451,91 @@ struct erfInv_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointSc
 	}
 };
 
-template<>
-struct erfInv_helper<float64_t>
-{
-	static float64_t __call(NBL_CONST_REF_ARG(float64_t) _x)
-	{
-		float64_t x = clamp<float64_t>(_x, NBL_FP64_LITERAL(-0.99999), NBL_FP64_LITERAL(0.99999));
-
-		float64_t w = float64_t(-log_helper<float32_t>::__call((float32_t(NBL_FP64_LITERAL(1.0)) - x) * float32_t(NBL_FP64_LITERAL(1.0)) + x));
-		float64_t p;
-		if (w < 6.250000)
-		{
-			w -= NBL_FP64_LITERAL(3.125000);
-			p = NBL_FP64_LITERAL(-3.6444120640178196996e-21);
-			p = NBL_FP64_LITERAL(-1.685059138182016589e-19) + p * w;
-			p = NBL_FP64_LITERAL(1.2858480715256400167e-18) + p * w;
-			p = NBL_FP64_LITERAL(1.115787767802518096e-17) + p * w;
-			p = NBL_FP64_LITERAL(-1.333171662854620906e-16) + p * w;
-			p = NBL_FP64_LITERAL(2.0972767875968561637e-17) + p * w;
-			p = NBL_FP64_LITERAL(6.6376381343583238325e-15) + p * w;
-			p = NBL_FP64_LITERAL(-4.0545662729752068639e-14) + p * w;
-			p = NBL_FP64_LITERAL(-8.1519341976054721522e-14) + p * w;
-			p = NBL_FP64_LITERAL(2.6335093153082322977e-12) + p * w;
-			p = NBL_FP64_LITERAL(-1.2975133253453532498e-11) + p * w;
-			p = NBL_FP64_LITERAL(-5.4154120542946279317e-11) + p * w;
-			p = NBL_FP64_LITERAL(1.051212273321532285e-09) + p * w;
-			p = NBL_FP64_LITERAL(-4.1126339803469836976e-09) + p * w;
-			p = NBL_FP64_LITERAL(-2.9070369957882005086e-08) + p * w;
-			p = NBL_FP64_LITERAL(4.2347877827932403518e-07) + p * w;
-			p = NBL_FP64_LITERAL(-1.3654692000834678645e-06) + p * w;
-			p = NBL_FP64_LITERAL(-1.3882523362786468719e-05) + p * w;
-			p = NBL_FP64_LITERAL(0.0001867342080340571352) + p * w;
-			p = NBL_FP64_LITERAL(-0.00074070253416626697512) + p * w;
-			p = NBL_FP64_LITERAL(-0.0060336708714301490533) + p * w;
-			p = NBL_FP64_LITERAL(0.24015818242558961693) + p * w;
-			p = NBL_FP64_LITERAL(1.6536545626831027356) + p * w;
-		}
-		else if (w < 16.000000)
-		{
-			w = sqrt_helper<float64_t>::__call(w) - NBL_FP64_LITERAL(3.250000);
-			p = NBL_FP64_LITERAL(2.2137376921775787049e-09);
-			p = NBL_FP64_LITERAL(9.0756561938885390979e-08) + p * w;
-			p = NBL_FP64_LITERAL(-2.7517406297064545428e-07) + p * w;
-			p = NBL_FP64_LITERAL(1.8239629214389227755e-08) + p * w;
-			p = NBL_FP64_LITERAL(1.5027403968909827627e-06) + p * w;
-			p = NBL_FP64_LITERAL(-4.013867526981545969e-06) + p * w;
-			p = NBL_FP64_LITERAL(2.9234449089955446044e-06) + p * w;
-			p = NBL_FP64_LITERAL(1.2475304481671778723e-05) + p * w;
-			p = NBL_FP64_LITERAL(-4.7318229009055733981e-05) + p * w;
-			p = NBL_FP64_LITERAL(6.8284851459573175448e-05) + p * w;
-			p = NBL_FP64_LITERAL(2.4031110387097893999e-05) + p * w;
-			p = NBL_FP64_LITERAL(-0.0003550375203628474796) + p * w;
-			p = NBL_FP64_LITERAL(0.00095328937973738049703) + p * w;
-			p = NBL_FP64_LITERAL(-0.0016882755560235047313) + p * w;
-			p = NBL_FP64_LITERAL(0.0024914420961078508066) + p * w;
-			p = NBL_FP64_LITERAL(-0.0037512085075692412107) + p * w;
-			p = NBL_FP64_LITERAL(0.005370914553590063617) + p * w;
-			p = NBL_FP64_LITERAL(1.0052589676941592334) + p * w;
-			p = NBL_FP64_LITERAL(3.0838856104922207635) + p * w;
-		}
-		else
-		{
-			w = sqrt_helper<float64_t>::__call(w) - NBL_FP64_LITERAL(5.000000);
-			p = NBL_FP64_LITERAL(-2.7109920616438573243e-11);
-			p = NBL_FP64_LITERAL(-2.5556418169965252055e-10) + p * w;
-			p = NBL_FP64_LITERAL(1.5076572693500548083e-09) + p * w;
-			p = NBL_FP64_LITERAL(-3.7894654401267369937e-09) + p * w;
-			p = NBL_FP64_LITERAL(7.6157012080783393804e-09) + p * w;
-			p = NBL_FP64_LITERAL(-1.4960026627149240478e-08) + p * w;
-			p = NBL_FP64_LITERAL(2.9147953450901080826e-08) + p * w;
-			p = NBL_FP64_LITERAL(-6.7711997758452339498e-08) + p * w;
-			p = NBL_FP64_LITERAL(2.2900482228026654717e-07) + p * w;
-			p = NBL_FP64_LITERAL(-9.9298272942317002539e-07) + p * w;
-			p = NBL_FP64_LITERAL(4.5260625972231537039e-06) + p * w;
-			p = NBL_FP64_LITERAL(-1.9681778105531670567e-05) + p * w;
-			p = NBL_FP64_LITERAL(7.5995277030017761139e-05) + p * w;
-			p = NBL_FP64_LITERAL(-0.00021503011930044477347) + p * w;
-			p = NBL_FP64_LITERAL(-0.00013871931833623122026) + p * w;
-			p = NBL_FP64_LITERAL(1.0103004648645343977) + p * w;
-			p = NBL_FP64_LITERAL(4.8499064014085844221) + p * w;
-		}
-
-		return p * x;
-	}
-};
+// Disabled: log_helper does not accept float64_t, which the specialization below calls
+// template<>
+// struct erfInv_helper<float64_t>
+// {
+// 	static float64_t __call(NBL_CONST_REF_ARG(float64_t) _x)
+// 	{
+// 		float64_t x = clamp<float64_t>(_x, NBL_FP64_LITERAL(-0.99999), NBL_FP64_LITERAL(0.99999));
+
+// 		float64_t w = -log_helper<float64_t>::__call((NBL_FP64_LITERAL(1.0) - x) * (NBL_FP64_LITERAL(1.0) + x));
+// 		float64_t p;
+// 		if (w < 6.250000)
+// 		{
+// 			w -= NBL_FP64_LITERAL(3.125000);
+// 			p = NBL_FP64_LITERAL(-3.6444120640178196996e-21);
+// 			p = NBL_FP64_LITERAL(-1.685059138182016589e-19) + p * w;
+// 			p = NBL_FP64_LITERAL(1.2858480715256400167e-18) + p * w;
+// 			p = NBL_FP64_LITERAL(1.115787767802518096e-17) + p * w;
+// 			p = NBL_FP64_LITERAL(-1.333171662854620906e-16) + p * w;
+// 			p = NBL_FP64_LITERAL(2.0972767875968561637e-17) + p * w;
+// 			p = NBL_FP64_LITERAL(6.6376381343583238325e-15) + p * w;
+// 			p = NBL_FP64_LITERAL(-4.0545662729752068639e-14) + p * w;
+// 			p = NBL_FP64_LITERAL(-8.1519341976054721522e-14) + p * w;
+// 			p = NBL_FP64_LITERAL(2.6335093153082322977e-12) + p * w;
+// 			p = NBL_FP64_LITERAL(-1.2975133253453532498e-11) + p * w;
+// 			p = NBL_FP64_LITERAL(-5.4154120542946279317e-11) + p * w;
+// 			p = NBL_FP64_LITERAL(1.051212273321532285e-09) + p * w;
+// 			p = NBL_FP64_LITERAL(-4.1126339803469836976e-09) + p * w;
+// 			p = NBL_FP64_LITERAL(-2.9070369957882005086e-08) + p * w;
+// 			p = NBL_FP64_LITERAL(4.2347877827932403518e-07) + p * w;
+// 			p = NBL_FP64_LITERAL(-1.3654692000834678645e-06) + p * w;
+// 			p = NBL_FP64_LITERAL(-1.3882523362786468719e-05) + p * w;
+// 			p = NBL_FP64_LITERAL(0.0001867342080340571352) + p * w;
+// 			p = NBL_FP64_LITERAL(-0.00074070253416626697512) + p * w;
+// 			p = NBL_FP64_LITERAL(-0.0060336708714301490533) + p * w;
+// 			p = NBL_FP64_LITERAL(0.24015818242558961693) + p * w;
+// 			p = NBL_FP64_LITERAL(1.6536545626831027356) + p * w;
+// 		}
+// 		else if (w < 16.000000)
+// 		{
+// 			w = sqrt_helper<float64_t>::__call(w) - NBL_FP64_LITERAL(3.250000);
+// 			p = NBL_FP64_LITERAL(2.2137376921775787049e-09);
+// 			p = NBL_FP64_LITERAL(9.0756561938885390979e-08) + p * w;
+// 			p = NBL_FP64_LITERAL(-2.7517406297064545428e-07) + p * w;
+// 			p = NBL_FP64_LITERAL(1.8239629214389227755e-08) + p * w;
+// 			p = NBL_FP64_LITERAL(1.5027403968909827627e-06) + p * w;
+// 			p = NBL_FP64_LITERAL(-4.013867526981545969e-06) + p * w;
+// 			p = NBL_FP64_LITERAL(2.9234449089955446044e-06) + p * w;
+// 			p = NBL_FP64_LITERAL(1.2475304481671778723e-05) + p * w;
+// 			p = NBL_FP64_LITERAL(-4.7318229009055733981e-05) + p * w;
+// 			p = NBL_FP64_LITERAL(6.8284851459573175448e-05) + p * w;
+// 			p = NBL_FP64_LITERAL(2.4031110387097893999e-05) + p * w;
+// 			p = NBL_FP64_LITERAL(-0.0003550375203628474796) + p * w;
+// 			p = NBL_FP64_LITERAL(0.00095328937973738049703) + p * w;
+// 			p = NBL_FP64_LITERAL(-0.0016882755560235047313) + p * w;
+// 			p = NBL_FP64_LITERAL(0.0024914420961078508066) + p * w;
+// 			p = NBL_FP64_LITERAL(-0.0037512085075692412107) + p * w;
+// 			p = NBL_FP64_LITERAL(0.005370914553590063617) + p * w;
+// 			p = NBL_FP64_LITERAL(1.0052589676941592334) + p * w;
+// 			p = NBL_FP64_LITERAL(3.0838856104922207635) + p * w;
+// 		}
+// 		else
+// 		{
+// 			w = sqrt_helper<float64_t>::__call(w) - NBL_FP64_LITERAL(5.000000);
+// 			p = NBL_FP64_LITERAL(-2.7109920616438573243e-11);
+// 			p = NBL_FP64_LITERAL(-2.5556418169965252055e-10) + p * w;
+// 			p = NBL_FP64_LITERAL(1.5076572693500548083e-09) + p * w;
+// 			p = NBL_FP64_LITERAL(-3.7894654401267369937e-09) + p * w;
+// 			p = NBL_FP64_LITERAL(7.6157012080783393804e-09) + p * w;
+// 			p = NBL_FP64_LITERAL(-1.4960026627149240478e-08) + p * w;
+// 			p = NBL_FP64_LITERAL(2.9147953450901080826e-08) + p * w;
+// 			p = NBL_FP64_LITERAL(-6.7711997758452339498e-08) + p * w;
+// 			p = NBL_FP64_LITERAL(2.2900482228026654717e-07) + p * w;
+// 			p = NBL_FP64_LITERAL(-9.9298272942317002539e-07) + p * w;
+// 			p = NBL_FP64_LITERAL(4.5260625972231537039e-06) + p * w;
+// 			p = NBL_FP64_LITERAL(-1.9681778105531670567e-05) + p * w;
+// 			p = NBL_FP64_LITERAL(7.5995277030017761139e-05) + p * w;
+// 			p = NBL_FP64_LITERAL(-0.00021503011930044477347) + p * w;
+// 			p = NBL_FP64_LITERAL(-0.00013871931833623122026) + p * w;
+// 			p = NBL_FP64_LITERAL(1.0103004648645343977) + p * w;
+// 			p = NBL_FP64_LITERAL(4.8499064014085844221) + p * w;
+// 		}
+
+// 		return p * x;
+// 	}
+// };
 
 #ifdef __HLSL_VERSION
 // SPIR-V already defines specializations for builtin vector types

From ff66405843fa0bd4fb81dbcb2831097aec7e9230 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 19 Mar 2025 16:13:26 +0700
Subject: [PATCH 11/11] removed cast fp64 from frisvad

---
 include/nbl/builtin/hlsl/math/functions.hlsl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index 283841d4ab..be341b6a12 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -103,17 +103,20 @@ void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
 template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3)
 void frisvad(NBL_CONST_REF_ARG(T) normal, NBL_REF_ARG(T) tangent, NBL_REF_ARG(T) bitangent)
 {
-	const typename vector_traits<T>::scalar_type a = NBL_FP64_LITERAL(1.0) / (NBL_FP64_LITERAL(1.0) + normal.z);
-	const typename vector_traits<T>::scalar_type b = -normal.x * normal.y * a;
-    if (normal.z < -NBL_FP64_LITERAL(0.9999999))
+    // terms shared by both basis vectors, kept in the vector's own scalar type
+    using scalar_t = typename vector_traits<T>::scalar_type;
+    const scalar_t one = _static_cast<scalar_t>(1);
+    const scalar_t rcpOnePlusZ = one / (one + normal.z);
+    const scalar_t crossTerm = -normal.x * normal.y * rcpOnePlusZ;
+    if (normal.z < -_static_cast<scalar_t>(0.9999999))
     {
         tangent = T(0.0,-1.0,0.0);
         bitangent = T(-1.0,0.0,0.0);
     }
     else
     {
-        tangent = T(NBL_FP64_LITERAL(1.0)-normal.x*normal.x*a, b, -normal.x);
-        bitangent = T(b, NBL_FP64_LITERAL(1.0)-normal.y*normal.y*a, -normal.y);
+        tangent = T(one - normal.x * normal.x * rcpOnePlusZ, crossTerm, -normal.x);
+        bitangent = T(crossTerm, one - normal.y * normal.y * rcpOnePlusZ, -normal.y);
     }
 }