hsalehipour · Apolo-ADSK · Sep 24, 2025 · Sep 24, 2025 · Sep 24, 2025 · Sep 25, 2025
diff --git a/examples/drivaer_fastback.py b/examples/drivaer_fastback.py
diff --git a/examples/mres_json.py b/examples/mres_json.py
diff --git a/examples/project.json b/examples/project.json
diff --git a/examples/sphere.py b/examples/sphere.py
diff --git a/examples/windsor.py b/examples/windsor.py
diff --git a/examples/windtunnel_json.py b/examples/windtunnel_json.py
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
@@ -12,7 +12,7 @@ class MultiresSimulationManager(MultiresIncompressibleNavierStokesStepper):
 
     def __init__(
         self,
-        omega,
+        omega_finest,
         grid,
         boundary_conditions=[],
         collision_type="BGK",
@@ -24,8 +24,8 @@ def __init__(
         super().__init__(grid, boundary_conditions, collision_type, forcing_scheme, force_vector)
 
         self.initializer = initializer
-        self.omega = omega
         self.count_levels = grid.count_levels
+        self.omega_list = [self.compute_omega(omega_finest, level) for level in range(self.count_levels)]
         self.mres_perf_opt = mres_perf_opt
         # Create fields
         self.rho = grid.create_field(cardinality=1, dtype=self.precision_policy.store_precision)
@@ -51,6 +51,27 @@ def __init__(
         # Construct the stepper skeleton
         self._construct_stepper_skeleton()
 
+    def compute_omega(self, omega_finest, level):
+        """
+        Compute the relaxation parameter omega at a given grid level based on the finest level omega.
+        We select a refinement ratio of 2 where a coarse cell at level L is uniformly divided into 2^d cells
+        where d is the dimension. to arrive at level L - 1, or in other words ∆x_{L-1} = ∆x_L/2.
+        For neighboring cells that interface two grid levels, a maximum jump in grid level of ∆L = 1 is
+        allowed. Due to acoustic scaling which requires the speed of sound cs to remain constant across various grid levels,
+        ∆tL ∝ ∆xL and hence ∆t_{L-1} = ∆t_{L}/2. In addition, the fluid viscosity \nu must also remain constant on each
+        grid level which leads to the following relationship for the relaxation parameter omega at grid level L base
+        on the finest grid level omega_finest.
+
+        Args:
+            omega_finest: Relaxation parameter at the finest grid level.
+            level: Current grid level (0-indexed, with 0 being the finest level).
+
+        Returns:
+            Relaxation parameter omega at the specified grid level.
+        """
+        omega0 = omega_finest
+        return 2 ** (level + 1) * omega0 / ((2**level - 1.0) * omega0 + 2.0)
+
     def export_macroscopic(self, fname_prefix):
         print(f"exporting macroscopic: #levels {self.count_levels}")
         self.macro(self.f_0, self.bc_mask, self.rho, self.u, streamId=0)
@@ -74,6 +95,10 @@ def _construct_stepper_skeleton(self):
         def recursion_reference(level, app):
             if level < 0:
                 return
+
+            # Compute omega at the current level
+            omega = self.omega_list[level]
+
             print(f"RECURSION down to level {level}")
             print(f"RECURSION Level {level}, COLLIDE")
 
@@ -85,7 +110,7 @@ def recursion_reference(level, app):
                 f_1=self.f_1,
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
-                omega=self.omega,
+                omega=omega,
                 timestep=0,
             )
 
@@ -110,6 +135,9 @@ def recursion_fused_finest(level, app):
             if level < 0:
                 return
 
+            # Compute omega at the current level
+            omega = self.omega_list[level]
+
             if level == 0:
                 print(f"RECURSION down to the finest level {level}")
                 print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
@@ -121,7 +149,7 @@ def recursion_fused_finest(level, app):
                     f_1=self.f_1,
                     bc_mask=self.bc_mask,
                     missing_mask=self.missing_mask,
-                    omega=self.omega,
+                    omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=True,
                 )
@@ -133,7 +161,7 @@ def recursion_fused_finest(level, app):
                     f_1=self.f_0,
                     bc_mask=self.bc_mask,
                     missing_mask=self.missing_mask,
-                    omega=self.omega,
+                    omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=False,
                 )
@@ -150,7 +178,7 @@ def recursion_fused_finest(level, app):
                 f_1=self.f_1,
                 bc_mask=self.bc_mask,
                 missing_mask=self.missing_mask,
-                omega=self.omega,
+                omega=omega,
                 timestep=0,
             )
             # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
@@ -186,4 +214,4 @@ def recursion_fused_finest(level, app):
 
         bk = self.grid.get_neon_backend()
         self.sk = neon.Skeleton(backend=bk)
-        self.sk.sequence("mres_nse_stepper", self.app)
+        self.sk.sequence("mres_nse_stepper", self.app)
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -63,6 +63,8 @@ def __init__(
         # Unpack the two warp functionals needed for this BC!
         if self.compute_backend == ComputeBackend.WARP:
             self.warp_functional, self.assemble_auxiliary_data = self.warp_functional
+        elif self.compute_backend == ComputeBackend.NEON:
+            self.neon_functional, self.assemble_auxiliary_data = self.neon_functional
 
     def _get_normal_vectors(self, indices):
         # Get the frequency count and most common element directly
@@ -173,7 +175,7 @@ def functional(
             return _f
 
         @wp.func
-        def assemble_auxiliary_data(
+        def assemble_auxiliary_data_warp(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -199,7 +201,40 @@ def assemble_auxiliary_data(
                     _f[_opp_indices[l]] = (self.compute_dtype(1.0) - sound_speed) * _f_pre[l] + sound_speed * f_aux
             return _f
 
+        @wp.func
+        def assemble_auxiliary_data_neon(
+            index: Any,
+            timestep: Any,
+            missing_mask: Any,
+            f_0: Any,
+            f_1: Any,
+            _f_pre: Any,
+            _f_post: Any,
+            level: Any = 0,
+        ):
+            # Prepare time-dependent dynamic data for imposing the boundary condition in the next iteration after streaming.
+            # We use directions that leave the domain for storing this prepared data.
+            # Since this function is called post-collisiotn: f_pre = f_post_stream and f_post = f_post_collision
+            _f = _f_post
+            nv = get_normal_vectors(missing_mask)
+            for lattice_dir in range(self.velocity_set.q):
+                if missing_mask[lattice_dir] == wp.uint8(1):
+                    # f_0 is the post-collision values of the current time-step
+                    # Get pull index associated with the "neighbours" pull_index
+                    offset = wp.vec3i(-_c[0, lattice_dir], -_c[1, lattice_dir], -_c[2, lattice_dir])
+                    for d in range(self.velocity_set.d):
+                        offset[d] = offset[d] - nv[d]
+                    offset_pull_index = wp.neon_ngh_idx(wp.int8(offset[0]), wp.int8(offset[1]), wp.int8(offset[2]))
+
+                    # The following is the post-streaming values of the neighbor cell
+                    # This function reads a field value at a given neighboring index and direction.
+                    unused_is_valid = wp.bool(False)
+                    f_aux = self.compute_dtype(wp.neon_read_ngh(f_0, index, offset_pull_index, lattice_dir, self.compute_dtype(0.0), unused_is_valid))
+                    _f[_opp_indices[lattice_dir]] = (self.compute_dtype(1.0) - sound_speed) * _f_pre[lattice_dir] + sound_speed * f_aux
+            return _f
+
         kernel = self._construct_kernel(functional)
+        assemble_auxiliary_data = assemble_auxiliary_data_warp if self.compute_backend == ComputeBackend.WARP else assemble_auxiliary_data_neon
 
         return (functional, assemble_auxiliary_data), kernel
 
@@ -212,3 +247,12 @@ def warp_implementation(self, _f_pre, _f_post, bc_mask, missing_mask):
             dim=_f_pre.shape[1:],
         )
         return _f_post
+
+    def _construct_neon(self):
+        functional, _ = self._construct_warp()
+        return functional, None
+
+    @Operator.register_backend(ComputeBackend.NEON)
+    def neon_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # rise exception as this feature is not implemented yet
+        raise NotImplementedError("This feature is not implemented in XLB with the NEON backend yet.")
diff --git a/xlb/operator/boundary_condition/bc_hybrid.py b/xlb/operator/boundary_condition/bc_hybrid.py
@@ -276,7 +276,12 @@ def hybrid_bounceback_grads(
             rho, u = self.macroscopic.warp_functional(f_post)
 
             # Compute Grad's approximation using full equation as in Eq (10) of Dorschner et al.
-            f_post = self.bc_helper.grads_approximate_fpop(_missing_mask, rho, u, f_post)
+            f_post = self.bc_helper.grads_approximate_fpop(
+                _missing_mask,                       
+                rho, 
+                u, 
+                f_post,
+                )            
             return f_post
 
         @wp.func
@@ -297,7 +302,8 @@ def hybrid_nonequilibrium_regularized(
             #     boundaries in the lattice Boltzmann method. Physical Review E 77, 056703.
 
             # Apply interpolated bounceback first to find missing populations at the boundary
-            u_wall = self.profile_functional(f_1, index, timestep)
+            u_wall = self.profile_functional(f_1, index, timestep)         
+
             f_post = self.bc_helper.interpolated_nonequilibrium_bounceback(
                 index,
                 _missing_mask,

diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
@@ -78,25 +78,22 @@ def __init__(
         # Currently we support three methods based on (a) aabb method (b) ray casting and (c) winding number.
         self.voxelization_method = voxelization_method
 
-        if self.compute_backend == ComputeBackend.WARP:
-            # Set local constants TODO: This is a hack and should be fixed with warp update
-            _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
-            _missing_mask_vec = wp.vec(self.velocity_set.q, dtype=wp.uint8)  # TODO fix vec bool
-
-        @wp.func
-        def assemble_auxiliary_data(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            return f_post
+        # Construct a default warp functional for assembling auxiliary data if needed
+        if self.compute_backend in [ComputeBackend.WARP, ComputeBackend.NEON]:
+
+            @wp.func
+            def assemble_auxiliary_data(
+                index: Any,
+                timestep: Any,
+                missing_mask: Any,
+                f_0: Any,
+                f_1: Any,
+                f_pre: Any,
+                f_post: Any,
+                level: Any = 0,
+            ):
+                return f_post
 
-        # Construct some helper warp functions for getting tid data
-        if self.compute_backend == ComputeBackend.WARP:
             self.assemble_auxiliary_data = assemble_auxiliary_data
 
     def pad_indices(self):
@@ -156,4 +153,4 @@ def kernel(
             for l in range(self.velocity_set.q):
                 f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
-        return kernel
+        return kernel
diff --git a/xlb/operator/boundary_condition/helper_functions_bc.py b/xlb/operator/boundary_condition/helper_functions_bc.py
@@ -143,7 +143,8 @@ def regularize_fpop(
             """
             # Compute momentum flux of off-equilibrium populations for regularization: Pi^1 = Pi^{neq}
             f_neq = fpop - feq
-            PiNeq = momentum_flux.warp_functional(f_neq)
+            PiNeq = momentum_flux.warp_functional(f_neq)            
+            epsilon = compute_dtype(1e-7)
 
             # Compute double dot product Qi:Pi1 (where Pi1 = PiNeq)
             nt = _d * (_d + 1) // 2
@@ -156,6 +157,7 @@ def regularize_fpop(
                 # fneq ~ f^1
                 fpop1 = compute_dtype(4.5) * _w[l] * QiPi1
                 fpop[l] = feq[l] + fpop1
+                fpop[l] = wp.max(epsilon, fpop[l])
             return fpop
 
         @wp.func
@@ -176,33 +178,46 @@ def grads_approximate_fpop(
 
             # Compute pressure tensor Pi using all f_post-streaming values
             Pi = momentum_flux.warp_functional(f_post)
+            epsilon = compute_dtype(1e-7)            
+            zero = compute_dtype(0.0)
+            one = compute_dtype(1.0)
+            three = compute_dtype(3.0)
+            four_pt_five = compute_dtype(4.5)
+            missing_count = zero
+
+            for l in range(_q):
+                if _missing_mask[l] == wp.uint8(1):
+                    missing_count += one
+            scale = one - ((one + missing_count) / compute_dtype(_q))
 
             # Compute double dot product Qi:Pi1 (where Pi1 = PiNeq)
             nt = _d * (_d + 1) // 2
             for l in range(_q):
                 if _missing_mask[l] == wp.uint8(1):
                     # compute dot product of qi and Pi
-                    QiPi = compute_dtype(0.0)
+                    QiPi = zero
                     for t in range(nt):
                         if t == 0 or t == 3 or t == 5:
-                            QiPi += _qi[l, t] * (Pi[t] - rho / compute_dtype(3.0))
+                            QiPi += _qi[l, t] * (Pi[t] - rho / three)
                         else:
                             QiPi += _qi[l, t] * Pi[t]
 
                     # Compute c.u
-                    cu = compute_dtype(0.0)
+                    cu = zero
                     for d in range(_d):
-                        if _c[d, l] == 1:
-                            cu += u[d]
-                        elif _c[d, l] == -1:
-                            cu -= u[d]
-                    cu *= compute_dtype(3.0)
+                        cu += _c_float[d, l] * u[d]
+                    cu *= three
 
                     # change f_post using the Grad's approximation
-                    f_post[l] = rho * _w[l] * (compute_dtype(1.0) + cu) + _w[l] * compute_dtype(4.5) * QiPi
+                    f_post[l] = rho * _w[l] * (one + cu) + _w[l] * four_pt_five * QiPi * scale
 
-            return f_post
+                    f_post[l] = wp.max(epsilon, f_post[l])
+                else:
+                    f_post[l] = wp.max(epsilon, f_post[l])
 
+            return f_post
+
+
         @wp.func
         def moving_wall_fpop_correction(
             u_wall: Any,
@@ -280,6 +295,7 @@ def interpolated_nonequilibrium_bounceback(
             needs_mesh_distance: bool,
         ):
             # Compute density, velocity using all f_post-collision values
+            epsilon = compute_dtype(1e-7)
             rho, u = macroscopic.warp_functional(f_pre)
             feq = equilibrium.warp_functional(rho, u)
 
@@ -291,6 +307,7 @@ def interpolated_nonequilibrium_bounceback(
 
             # Apply method in Tao et al (2018) [1] to find missing populations at the boundary
             one = compute_dtype(1.0)
+            half = compute_dtype(0.5)
             for l in range(_q):
                 # If the mask is missing then take the opposite index
                 if _missing_mask[l] == wp.uint8(1):
@@ -299,7 +316,7 @@ def interpolated_nonequilibrium_bounceback(
                         # use weights associated with curved boundaries that are properly stored in f_1.
                         weight = compute_dtype(self.distance_decoder_function(f_1, index, l))
                     else:
-                        weight = compute_dtype(0.5)
+                        weight = half
 
                     # Use non-equilibrium bounceback to find f_missing:
                     fneq = f_pre[_opp_indices[l]] - feq[_opp_indices[l]]
@@ -313,6 +330,10 @@ def interpolated_nonequilibrium_bounceback(
                     f_wall = feq_wall[l] + fneq
                     f_post[l] = (f_wall + weight * f_pre[l]) / (one + weight)
 
+                    f_post[l] = wp.max(epsilon, f_post[l])
+                else:
+                    f_post[l] = wp.max(epsilon, f_post[l])
+
             return f_post
 
         @wp.func