diff --git a/examples/cfd/grid_refinement/ahmed.py b/examples/cfd/grid_refinement/ahmed.py
index f6ef8b20..3b258870 100644
--- a/examples/cfd/grid_refinement/ahmed.py
+++ b/examples/cfd/grid_refinement/ahmed.py
@@ -56,7 +56,7 @@ def generate_cuboid_mesh(stl_filename, voxel_size):
     """
     # Domain multipliers for each refinement level
     domain_multiplier = [
-        [3.0, 4.0, 2.5, 2.5, 0.0, 4.0],     # -x, x, -y, y, -z, z
+        [3.0, 4.0, 2.5, 2.5, 0.0, 4.0],  # -x, x, -y, y, -z, z
         [1.2, 1.25, 1.75, 1.75, 0.0, 1.5],
         [0.8, 1.0, 1.25, 1.25, 0.0, 1.2],
         [0.5, 0.65, 0.6, 0.60, 0.0, 0.6],
diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 5f9ddeb8..66945e89 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -16,31 +16,59 @@
 
 
 def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation")
-    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+    # Define valid options for consistency
+    COMPUTE_BACKENDS = ["neon", "warp", "jax"]
+    PRECISION_OPTIONS = ["fp32/fp32", "fp64/fp64", "fp64/fp32", "fp32/fp16"]
+    VELOCITY_SETS = ["D3Q19", "D3Q27"]
+    COLLISION_MODELS = ["BGK", "KBC"]
+    OCC_OPTIONS = ["standard", "none"]
+
+    parser = argparse.ArgumentParser(
+        description="MLUPS Benchmark for 3D Lattice Boltzmann Method Simulation",
+        epilog=f"""
+Examples:
+  %(prog)s 100 1000 neon fp32/fp32
+  %(prog)s 200 500 neon fp64/fp64 --collision_model KBC --velocity_set D3Q27
+  %(prog)s 150 2000 neon fp32/fp32 --gpu_devices=[0,1,2] --measure_scalability --report
+  %(prog)s 100 1000 neon fp32/fp32 --repetitions 5 --export_final_velocity
+        """,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    # Positional arguments
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid (e.g., 100)")
+    parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation (e.g., 1000)")
+    parser.add_argument("compute_backend", type=str, choices=COMPUTE_BACKENDS, help=f"Backend for the simulation ({', '.join(COMPUTE_BACKENDS)})")
+    parser.add_argument("precision", type=str, choices=PRECISION_OPTIONS, help=f"Precision for the simulation ({', '.join(PRECISION_OPTIONS)})")
+
+    # Optional arguments
+    parser.add_argument("--gpu_devices", type=str, default=None, help="CUDA devices to use for Neon backend (e.g., [0,1,2] or [0])")
     parser.add_argument(
-        "--gpu_devices",
+        "--velocity_set",
         type=str,
-        default=None,
-        help="List of the CUDA devices to use (e.g., --gpu_devices=[0,1,2]). This is only used for Neon backend.",
+        default="D3Q19",
+        choices=VELOCITY_SETS,
+        help=f"Lattice velocity set (default: D3Q19, choices: {', '.join(VELOCITY_SETS)})",
     )
-    # add a flat to choose between 19 or 27 velocity set
-    parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
-    # add a flat to choose between multi-gpu occ options based on the neon occ:
     parser.add_argument(
-        "--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, none) (default: standard)"
+        "--collision_model",
+        type=str,
+        default="BGK",
+        choices=COLLISION_MODELS,
+        help=f"Collision model (default: BGK, choices: {', '.join(COLLISION_MODELS)}, KBC requires D3Q27)",
     )
-    parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
-    parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)")
-    parser.add_argument("--measure_scalability", action="store_true", help="Measure scalability of the simulation (default: disabled)")
     parser.add_argument(
-        "--repetitions",
-        type=int,
-        default=1,
-        help="Number of repetitions for the simulation (default: 1) to get the average MLUPs and standard deviation",
+        "--occ",
+        type=str,
+        default="standard",
+        choices=OCC_OPTIONS,
+        help=f"Overlapping Communication and Computation strategy (default: standard, choices: {', '.join(OCC_OPTIONS)})",
+    )
+    parser.add_argument("--report", action="store_true", help="Generate Neon performance report")
+    parser.add_argument("--export_final_velocity", action="store_true", help="Export final velocity field to VTI file")
+    parser.add_argument("--measure_scalability", action="store_true", help="Measure performance across different GPU counts")
+    parser.add_argument(
+        "--repetitions", type=int, default=1, metavar="N", help="Number of simulation repetitions for statistical analysis (default: 1)"
     )
 
     args = parser.parse_args()
@@ -56,29 +84,32 @@ def parse_arguments():
         except (ValueError, SyntaxError):
             raise ValueError("Invalid gpu_devices format. Use format like [0,1,2] or [0]")
 
-    # Checking the compute backend and covert it to the right type
-    compute_backend = None
-    if args.compute_backend == "jax":
-        compute_backend = ComputeBackend.JAX
-    elif args.compute_backend == "warp":
-        compute_backend = ComputeBackend.WARP
-    elif args.compute_backend == "neon":
-        compute_backend = ComputeBackend.NEON
-    else:
-        raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.")
+    # Validate and convert compute backend
+    compute_backend_map = {
+        "jax": ComputeBackend.JAX,
+        "warp": ComputeBackend.WARP,
+        "neon": ComputeBackend.NEON,
+    }
+    compute_backend = compute_backend_map.get(args.compute_backend)
+    if compute_backend is None:
+        raise ValueError(f"Invalid compute backend '{args.compute_backend}'. Use: {', '.join(COMPUTE_BACKENDS)}")
     args.compute_backend = compute_backend
 
-    # Checking OCC
-    if args.occ not in ["standard", "none"]:
-        raise ValueError("Invalid occupancy option. Use 'standard', or 'none'.")
-    if args.gpu_devices is None and args.compute_backend == ComputeBackend.NEON:
-        print("[Warning] No GPU devices specified. Using default device 0.")
-        args.gpu_devices = [0]
+    # Handle GPU devices for Neon backend
     if args.compute_backend == ComputeBackend.NEON:
+        if args.gpu_devices is None:
+            print("[INFO] No GPU devices specified. Using default device 0.")
+            args.gpu_devices = [0]
+
         import neon
 
-        occ = neon.SkeletonConfig.OCC.from_string(args.occ)
-        args.occ = occ
+        occ_enum = neon.SkeletonConfig.OCC.from_string(args.occ)
+        args.occ_enum = occ_enum  # Store the enum for Neon
+        args.occ_display = args.occ  # Store the original string for display
+    else:
+        if args.gpu_devices is not None:
+            raise ValueError(f"--gpu_devices can only be used with Neon backend, not {args.compute_backend.name}")
+        args.gpu_devices = [0]  # Default for non-Neon backends
 
     # Checking precision policy
     precision_policy_map = {
@@ -89,12 +120,12 @@ def parse_arguments():
     }
     precision_policy = precision_policy_map.get(args.precision)
     if precision_policy is None:
-        raise ValueError("Invalid precision specified.")
+        raise ValueError(f"Invalid precision '{args.precision}'. Use: {', '.join(PRECISION_OPTIONS)}")
     args.precision_policy = precision_policy
 
-    # Checking velocity set
-    if args.velocity_set not in ["D3Q19", "D3Q27"]:
-        raise ValueError("Invalid velocity set. Use 'D3Q19' or 'D3Q27'.")
+    # Validate collision model and velocity set compatibility
+    if args.collision_model == "KBC" and args.velocity_set != "D3Q27":
+        raise ValueError("KBC collision model requires D3Q27 velocity set. Use --velocity_set D3Q27")
 
     if args.velocity_set == "D3Q19":
         velocity_set = xlb.velocity_set.D3Q19(precision_policy=args.precision_policy, compute_backend=compute_backend)
@@ -102,42 +133,45 @@ def parse_arguments():
         velocity_set = xlb.velocity_set.D3Q27(precision_policy=args.precision_policy, compute_backend=compute_backend)
     args.velocity_set = velocity_set
 
-    if args.gpu_devices is not None and args.compute_backend != ComputeBackend.NEON:
-        raise ValueError("--gpu_devices can be used only with the Neon backend.")
-
-    if args.gpu_devices is None:
-        args.gpu_devices = [0]
-
     print_args(args)
 
     return args
 
 
 def print_args(args):
-    # Print simulation configuration
-    print("=" * 60)
-    print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
-    print("=" * 60)
-    print(f"Grid Size:            {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
-    print(f"Total Lattice Points: {args.cube_edge**3:,}")
-    print(f"Time Steps:           {args.num_steps:,}")
-    print(f"Compute Backend:      {args.compute_backend.name}")
-    print(f"Precision Policy:     {args.precision}")
-    print(f"Velocity Set:         {args.velocity_set.__class__.__name__}")
-    print(f"Generate Report:      {'Yes' if args.report else 'No'}")
-    print(f"Measure Scalability:  {'Yes' if args.measure_scalability else 'No'}")
-    print(f"Export Velocity:      {'Yes' if args.export_final_velocity else 'No'}")
-    print(f"Repetitions:          {args.repetitions}")
+    """Print simulation configuration in a clean, organized format"""
+    print("\n" + "=" * 70)
+    print("                    SIMULATION CONFIGURATION")
+    print("=" * 70)
+
+    # Grid and simulation parameters
+    print("GRID & SIMULATION:")
+    print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"  Total Lattice Points:   {args.cube_edge**3:,}")
+    print(f"  Time Steps:             {args.num_steps:,}")
+    print(f"  Repetitions:            {args.repetitions}")
+
+    # Computational settings
+    print("\nCOMPUTATIONAL SETTINGS:")
+    print(f"  Compute Backend:        {args.compute_backend.name}")
+    print(f"  Precision Policy:       {args.precision}")
+    print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Collision Model:        {args.collision_model}")
 
+    # Backend-specific settings
     if args.compute_backend.name == "NEON":
-        print(f"GPU Devices:          {args.gpu_devices}")
-        # Convert the neon OCC enum back to string for display
-        occ_display = args.occ.to_string() if hasattr(args.occ, "__class__") else args.occ
-        print(f"OCC Strategy:         {occ_display}")
+        print("\nNEON BACKEND SETTINGS:")
+        print(f"  GPU Devices:            {args.gpu_devices}")
+        print(f"  OCC Strategy:           {args.occ_display}")
 
-    print("=" * 60)
-    print("Starting simulation...")
-    print()
+    # Output options
+    print("\nOUTPUT OPTIONS:")
+    print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
+    print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
+    print(f"  Export Velocity:        {'Yes' if args.export_final_velocity else 'No'}")
+
+    print("=" * 70)
+    print("Starting simulation...\n")
 
 
 def init_xlb(args):
@@ -148,12 +182,14 @@ def init_xlb(args):
     )
     options = None
     if args.compute_backend == ComputeBackend.NEON:
-        neon_options = {"occ": args.occ, "device_list": args.gpu_devices}
+        neon_options = {"occ": args.occ_enum, "device_list": args.gpu_devices}
         options = neon_options
     return args.compute_backend, args.precision_policy, options
 
 
-def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices):
+def run_simulation(
+    compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices, collision_model
+):
     grid = grid_factory(grid_shape, backend_config=options)
     box = grid.bounding_box_indices()
     box_no_edge = grid.bounding_box_indices(remove_edges=True)
@@ -170,7 +206,7 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt
     stepper = IncompressibleNavierStokesStepper(
         grid=grid,
         boundary_conditions=boundary_conditions,
-        collision_type="BGK",
+        collision_type=collision_model,
         backend_config=options,
     )
 
@@ -232,50 +268,6 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time):
     return mlups
 
 
-def print_summary(args, elapsed_time, mlups):
-    """Print comprehensive simulation summary with parameters and performance results"""
-    total_lattice_points = args.cube_edge**3
-    total_lattice_updates = total_lattice_points * args.num_steps
-    lattice_points_per_second = total_lattice_updates / elapsed_time
-
-    print("\n\n\n" + "=" * 70)
-    print("                    SIMULATION SUMMARY")
-    print("=" * 70)
-
-    # Simulation Parameters
-    print("SIMULATION PARAMETERS:")
-    print("-" * 25)
-    print(f"  Grid Size:              {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
-    print(f"  Total Lattice Points:   {total_lattice_points:,}")
-    print(f"  Time Steps:             {args.num_steps:,}")
-    print(f"  Total Lattice Updates:  {total_lattice_updates:,}")
-    print(f"  Compute Backend:        {args.compute_backend.name}")
-    print(f"  Precision Policy:       {args.precision}")
-    print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
-    print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
-    print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
-
-    if args.compute_backend.name == "NEON":
-        print(f"  GPU Devices:            {args.gpu_devices}")
-        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
-        print(f"  OCC Strategy:           {occ_display}")
-
-    print()
-
-    # Performance Results
-    print("PERFORMANCE RESULTS:")
-    print("-" * 20)
-    print(f"  Time in main loop:      {elapsed_time:.3f} seconds")
-    print(f"  MLUPs:                  {mlups:.2f}")
-    print(f"  Time per LBM step:      {elapsed_time / args.num_steps * 1000:.3f} ms")
-
-    if args.compute_backend.name == "NEON" and len(args.gpu_devices) > 1:
-        mlups_per_gpu = mlups / len(args.gpu_devices)
-        print(f"  MLUPs per GPU:          {mlups_per_gpu:.2f}")
-
-    print("=" * 70)
-
-
 def print_summary_with_stats(args, stats):
     """Print comprehensive simulation summary with statistics from multiple repetitions"""
     total_lattice_points = args.cube_edge**3
@@ -301,12 +293,13 @@ def print_summary_with_stats(args, stats):
     print(f"  Compute Backend:        {args.compute_backend.name}")
     print(f"  Precision Policy:       {args.precision}")
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Collision Model:        {args.collision_model}")
     print(f"  Generate Report:        {'Yes' if args.report else 'No'}")
     print(f"  Measure Scalability:    {'Yes' if args.measure_scalability else 'No'}")
 
     if args.compute_backend.name == "NEON":
         print(f"  GPU Devices:            {args.gpu_devices}")
-        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
+        occ_display = args.occ_display
         print(f"  OCC Strategy:           {occ_display}")
 
     print()
@@ -371,9 +364,10 @@ def print_scalability_summary(args, stats_list):
     print(f"  Compute Backend:        {args.compute_backend.name}")
     print(f"  Precision Policy:       {args.precision}")
     print(f"  Velocity Set:           {args.velocity_set.__class__.__name__}")
+    print(f"  Collision Model:        {args.collision_model}")
 
     if args.compute_backend.name == "NEON":
-        occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ
+        occ_display = args.occ_display
         print(f"  OCC Strategy:           {occ_display}")
         print(f"  Available GPU Devices:  {args.gpu_devices}")
 
@@ -439,11 +433,18 @@ def print_scalability_summary(args, stats_list):
 
 def report(args, stats):
     import neon
+    import sys
 
     report = neon.Report("LBM MLUPS LDC")
+
+    # Save the full command line
+    command_line = " ".join(sys.argv)
+    report.add_member("command_line", command_line)
+
     report.add_member("velocity_set", args.velocity_set.__class__.__name__)
     report.add_member("compute_backend", args.compute_backend.name)
     report.add_member("precision_policy", args.precision)
+    report.add_member("collision_model", args.collision_model)
     report.add_member("grid_size", args.cube_edge)
     report.add_member("num_steps", args.num_steps)
     report.add_member("repetitions", args.repetitions)
@@ -463,16 +464,27 @@ def report(args, stats):
     report.add_member("elapsed_time", stats["mean_elapsed_time"])
     report.add_member("mlups", stats["mean_mlups"])
 
-    report.add_member("occ", (args.occ.to_string()))
+    report.add_member("occ", args.occ_display)
     report.add_member_vector("gpu_devices", args.gpu_devices)
     report.add_member("num_devices", len(args.gpu_devices))
     report.add_member("measure_scalability", args.measure_scalability)
 
-    report_name = "mlups_3d_" + f"size_{args.cube_edge}"
-    if args.measure_scalability:
-        report_name += f"_dev_{len(args.gpu_devices)}"
+    # Generate report name following the convention: script_name + parameters
+    report_name = "mlups_3d"
+    report_name += f"_velocity_set_{args.velocity_set.__class__.__name__}"
+    report_name += f"_compute_backend_{args.compute_backend.name}"
+    report_name += f"_precision_policy_{args.precision.replace('/', '_')}"
+    report_name += f"_collision_model_{args.collision_model}"
+    report_name += f"_grid_size_{args.cube_edge}"
+    report_name += f"_num_steps_{args.num_steps}"
+
+    if args.compute_backend.name == "NEON":
+        report_name += f"_occ_{args.occ_display}"
+        report_name += f"_num_devices_{len(args.gpu_devices)}"
+
     if args.repetitions > 1:
-        report_name += f"_rep_{args.repetitions}"
+        report_name += f"_repetitions_{args.repetitions}"
+
     report.write(report_name, True)
 
 
@@ -494,6 +506,7 @@ def benchmark(args):
         export_final_velocity=args.export_final_velocity,
         repetitions=args.repetitions,
         num_devices=len(args.gpu_devices),
+        collision_model=args.collision_model,
     )
 
     for elapsed_time in elapsed_time_list:
diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py
index cff15eb1..3a143764 100644
--- a/examples/performance/mlups_3d_multires.py
+++ b/examples/performance/mlups_3d_multires.py
@@ -10,21 +10,97 @@
 from xlb.grid import multires_grid_factory
 from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper
 from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC
+from xlb.mres_perf_optimization_type import MresPerfOptimizationType
 
 
 def parse_arguments():
-    parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)")
+    parser = argparse.ArgumentParser(
+        description="MLUPS for 3D Lattice Boltzmann Method Simulation with Multi-resolution Grid",
+        epilog="""
+Examples:
+  %(prog)s 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM
+  %(prog)s 200 500 neon fp64/fp64 3 FUSION_AT_FINEST --report
+  %(prog)s 50 2000 neon fp32/fp16 2 NAIVE_COLLIDE_STREAM --export_final_velocity
+
+Valid values:
+  compute_backend: neon
+  precision: fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16
+  mres_perf_opt: NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST
+  velocity_set: D3Q19, D3Q27
+  collision_model: BGK, KBC
+        """,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
     # Positional arguments
-    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid")
-    parser.add_argument("num_steps", type=int, help="Timestep for the simulation")
-    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)")
-    parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)")
+    parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid (e.g., 100)")
+    parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation (e.g., 1000)")
+    parser.add_argument("compute_backend", type=str, help="Backend for the simulation (neon)")
+    parser.add_argument("precision", type=str, help="Precision for the simulation (fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16)")
+    parser.add_argument("num_levels", type=int, help="Number of levels for the multiresolution grid (e.g., 2)")
+    parser.add_argument(
+        "mres_perf_opt",
+        type=MresPerfOptimizationType.from_string,
+        help="Multi-resolution performance optimization strategy (NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST)",
+    )
 
     # Optional arguments
     parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)")
     parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)")
+    parser.add_argument("--collision_model", type=str, default="BGK", help="Collision model: BGK or KBC (default: BGK)")
+
+    parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)")
+    parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)")
+
+    try:
+        args = parser.parse_args()
+    except SystemExit:
+        # Re-raise with custom message
+        print("\n" + "=" * 60)
+        print("USAGE EXAMPLES:")
+        print("=" * 60)
+        print("python mlups_3d_multires.py 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM")
+        print("python mlups_3d_multires.py 200 500 neon fp64/fp64 3 FUSION_AT_FINEST --report")
+        print("\nVALID VALUES:")
+        print("  compute_backend: neon")
+        print("  precision: fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16")
+        print("  mres_perf_opt: NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST")
+        print("  velocity_set: D3Q19, D3Q27")
+        print("  collision_model: BGK, KBC")
+        print("=" * 60)
+        raise
+
+    print_args(args)
+
+    if args.compute_backend != "neon":
+        raise ValueError("Invalid compute backend specified. Use 'neon' which supports multi-resolution!")
+
+    if args.collision_model not in ["BGK", "KBC"]:
+        raise ValueError("Invalid collision model specified. Use 'BGK' or 'KBC'.")
+
+    return args
 
-    return parser.parse_args()
+
+def print_args(args):
+    # Print simulation configuration
+    print("=" * 60)
+    print("           3D LATTICE BOLTZMANN SIMULATION CONFIG")
+    print("=" * 60)
+    print(f"Grid Size:            {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})")
+    print(f"Total Lattice Points: {args.cube_edge**3:,}")
+    print(f"Time Steps:           {args.num_steps:,}")
+    print(f"Number Levels:        {args.num_levels}")
+    print(f"Compute Backend:      {args.compute_backend}")
+    print(f"Precision Policy:     {args.precision}")
+    print(f"Velocity Set:         {args.velocity_set}")
+    print(f"Collision Model:      {args.collision_model}")
+    print(f"Mres Perf Opt:        {args.mres_perf_opt}")
+    print(f"Generate Report:      {'Yes' if args.report else 'No'}")
+    print(f"Export Velocity:      {'Yes' if args.export_final_velocity else 'No'}")
+
+    print("=" * 60)
+    print("Starting simulation...")
+    print()
 
 
 def setup_simulation(args):
@@ -61,7 +137,7 @@ def setup_simulation(args):
     return velocity_set
 
 
-def problem1(grid_shape, velocity_set):
+def problem1(grid_shape, velocity_set, num_levels):
     def peel(dim, idx, peel_level, outwards):
         if outwards:
             xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level
@@ -108,7 +184,6 @@ def get_levels(num_levels):
         levels.append(lastLevel)
         return levels
 
-    num_levels = 4
     levels = get_levels(num_levels)
 
     grid = multires_grid_factory(
@@ -129,9 +204,8 @@ def get_levels(num_levels):
     return grid, lid, walls
 
 
-def problem2(grid_shape, velocity_set):
+def problem2(grid_shape, velocity_set, num_levels):
     # Example 2: Coarsest at the edges (2 level only)
-    num_levels = 4
     level_origins = []
     level_list = []
     for lvl in range(num_levels):
@@ -167,7 +241,15 @@ def problem2(grid_shape, velocity_set):
     return grid, lid, walls
 
 
-def run(velocity_set, grid_shape, num_steps):
+def run(
+    velocity_set,
+    grid_shape,
+    num_steps,
+    num_levels,
+    collision_model,
+    export_final_velocity,
+    mres_perf_opt,
+):
     # Create grid and setup boundary conditions
 
     # Convert indices to list of indices per level
@@ -177,10 +259,10 @@ def run(velocity_set, grid_shape, num_steps):
     # walls = construct_indices_per_level(grid_shape, walls, levels_mask, level_origins)
 
     # Example 1: fine to coarse
-    # grid, lid, walls = problem1(grid_shape, velocity_set)
+    # grid, lid, walls = problem1(grid_shape, velocity_set, num_levels)
 
     # Example 2: Coarse to fine:
-    grid, lid, walls = problem2(grid_shape, velocity_set)
+    grid, lid, walls = problem1(grid_shape, velocity_set, num_levels)
 
     prescribed_vel = 0.1
     boundary_conditions = [
@@ -195,7 +277,13 @@ def run(velocity_set, grid_shape, num_steps):
     omega_finest = 1.0 / (3.0 * visc + 0.5)
 
     # Define a multi-resolution simulation manager
-    sim = xlb.helper.MultiresSimulationManager(omega_finest=omega_finest, grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC")
+    sim = xlb.helper.MultiresSimulationManager(
+        omega_finest=omega_finest,
+        grid=grid,
+        boundary_conditions=boundary_conditions,
+        collision_type=collision_model,
+        mres_perf_opt=mres_perf_opt,
+    )
 
     # sim.export_macroscopic("Initial_")
     # sim.step()
@@ -203,6 +291,10 @@ def run(velocity_set, grid_shape, num_steps):
     print("start timing")
     wp.synchronize()
     start_time = time.time()
+
+    if num_levels == 1:
+        num_steps = num_steps // 2
+
     for i in range(num_steps):
         sim.step()
         # if i % 1000 == 0:
@@ -212,6 +304,9 @@ def run(velocity_set, grid_shape, num_steps):
     t = time.time() - start_time
     print(f"Timing  {t}")
 
+    if export_final_velocity:
+        sim.export_macroscopic("u_lid_driven_cavity_")
+
     # sim.export_macroscopic("u_lid_driven_cavity_")
     num_levels = grid.count_levels
     return {"time": t, "num_levels": num_levels}
@@ -236,11 +331,42 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels):
     # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity")
 
 
+def generate_report(args, stats, mlups_stats):
+    """Generate a neon report file with simulation parameters and results"""
+    import neon
+    import sys
+
+    report = neon.Report("LBM MLUPS Multiresolution LDC")
+
+    # Save the full command line
+    command_line = " ".join(sys.argv)
+    report.add_member("command_line", command_line)
+
+    report.add_member("velocity_set", args.velocity_set)
+    report.add_member("compute_backend", args.compute_backend)
+    report.add_member("precision_policy", args.precision)
+    report.add_member("collision_model", args.collision_model)
+    report.add_member("grid_size", args.cube_edge)
+    report.add_member("num_steps", args.num_steps)
+    report.add_member("num_levels", stats["num_levels"])
+    report.add_member("finer_steps", mlups_stats["finer_steps"])
+
+    # Performance metrics
+    report.add_member("elapsed_time", stats["time"])
+    report.add_member("emlups", mlups_stats["EMLUPS"])
+
+    report_name = f"mlups_3d_multires_size_{args.cube_edge}_levels_{stats['num_levels']}"
+    report.write(report_name, True)
+    print("Report generated successfully.")
+
+
 def main():
     args = parse_arguments()
     velocity_set = setup_simulation(args)
     grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge)
-    stats = run(velocity_set, grid_shape, args.num_steps)
+    stats = run(
+        velocity_set, grid_shape, args.num_steps, args.num_levels, args.collision_model, args.export_final_velocity, mres_perf_opt=args.mres_perf_opt
+    )
     mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats["time"], stats["num_levels"])
 
     print(f"Simulation completed in {stats['time']:.2f} seconds")
@@ -252,6 +378,10 @@ def main():
     EMLUPS = mlups_stats["EMLUPS"]
     print(f"EMLUPs: {EMLUPS:.2f}")
 
+    # Generate report if requested
+    if args.report:
+        generate_report(args, stats, mlups_stats)
+
 
 if __name__ == "__main__":
     main()
diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py
index 8abc56b6..c1c43a10 100644
--- a/xlb/helper/simulation_manager.py
+++ b/xlb/helper/simulation_manager.py
@@ -105,7 +105,7 @@ def recursion_reference(level, app):
             self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
-                mres_level=level,
+                level=level,
                 f_0=self.f_0,
                 f_1=self.f_1,
                 bc_mask=self.bc_mask,
@@ -122,7 +122,7 @@ def recursion_reference(level, app):
             self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
-                mres_level=level,
+                level=level,
                 f_0=self.f_1,
                 f_1=self.f_0,
                 bc_mask=self.bc_mask,
@@ -144,11 +144,11 @@ def recursion_fused_finest(level, app):
                 self.add_to_app(
                     app=app,
                     op_name="finest_fused_pull",
-                    mres_level=level,
-                    f_0=self.f_0,
-                    f_1=self.f_1,
-                    bc_mask=self.bc_mask,
-                    missing_mask=self.missing_mask,
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
                     omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=True,
@@ -156,11 +156,11 @@ def recursion_fused_finest(level, app):
                 self.add_to_app(
                     app=app,
                     op_name="finest_fused_pull",
-                    mres_level=level,
-                    f_0=self.f_1,
-                    f_1=self.f_0,
-                    bc_mask=self.bc_mask,
-                    missing_mask=self.missing_mask,
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
                     omega=omega,
                     timestep=0,
                     is_f1_the_explosion_src_field=False,
@@ -173,11 +173,11 @@ def recursion_fused_finest(level, app):
             self.add_to_app(
                 app=app,
                 op_name="collide_coarse",
-                mres_level=level,
-                f_0=self.f_0,
-                f_1=self.f_1,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
                 omega=omega,
                 timestep=0,
             )
@@ -196,19 +196,256 @@ def recursion_fused_finest(level, app):
             self.add_to_app(
                 app=app,
                 op_name="stream_coarse_step_ABC",
-                mres_level=level,
-                f_0=self.f_1,
-                f_1=self.f_0,
-                bc_mask=self.bc_mask,
-                missing_mask=self.missing_mask,
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=self.coalescence_factor,
+                timestep=0,
+            )
+
+        def recursion_fused_finest_254(level, app):
+            if level < 0:
+                return
+
+            # Compute omega at the current level
+            omega = self.omega_list[level]
+
+            if level == 0:
+                print(f"RECURSION down to the finest level {level}")
+                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=True,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=False,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                )
+                return
+
+            print(f"RECURSION down to level {level}")
+            print(f"RECURSION Level {level}, COLLIDE")
+
+            self.add_to_app(
+                app=app,
+                op_name="collide_coarse",
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=omega,
+                timestep=0,
+            )
+            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
+            # so is_self_f1_the_coalescence_dst_field is True
+            # 2. Explision data is the output from the corser collide, which is f_1=self.f_1.
+            # so is_self_f1_the_explosion_src_field is True
+
+            if level - 1 == 0:
+                recursion_fused_finest_254(level - 1, app)
+            else:
+                recursion_fused_finest_254(level - 1, app)
+                recursion_fused_finest_254(level - 1, app)
+            # Important: swapping of f_0 and f_1 is done here
+            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            self.add_to_app(
+                app=app,
+                op_name="stream_coarse_step_ABC",
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
                 omega=self.coalescence_factor,
                 timestep=0,
             )
 
+        def recursion_fused_finest_254_all(level, app):
+            if level < 0:
+                return
+
+            # Compute omega at the current level
+            omega = self.omega_list[level]
+
+            if level == 0:
+                print(f"RECURSION down to the finest level {level}")
+                print(f"RECURSION Level {level}, Fused STREAM and COLLIDE")
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=True,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_0,
+                    f_1_fd=self.f_1,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_no_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                    timestep=0,
+                    is_f1_the_explosion_src_field=False,
+                )
+                self.add_to_app(
+                    app=app,
+                    op_name="finest_fused_pull_254",
+                    level=level,
+                    f_0_fd=self.f_1,
+                    f_1_fd=self.f_0,
+                    bc_mask_fd=self.bc_mask,
+                    missing_mask_fd=self.missing_mask,
+                    omega=omega,
+                )
+                return
+
+            print(f"RECURSION down to level {level}")
+            print(f"RECURSION Level {level}, COLLIDE")
+
+            self.add_to_app(
+                app=app,
+                op_name="collide_coarse_no_254",
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=omega,
+                timestep=0,
+            )
+            self.add_to_app(
+                app=app,
+                op_name="collide_coarse_254",
+                level=level,
+                f_0_fd=self.f_0,
+                f_1_fd=self.f_1,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=omega,
+                timestep=0,
+            )
+            # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1.
+            # so is_self_f1_the_coalescence_dst_field is True
+            # 2. Explision data is the output from the corser collide, which is f_1=self.f_1.
+            # so is_self_f1_the_explosion_src_field is True
+
+            if level - 1 == 0:
+                recursion_fused_finest_254_all(level - 1, app)
+            else:
+                recursion_fused_finest_254_all(level - 1, app)
+                recursion_fused_finest_254_all(level - 1, app)
+            # Important: swapping of f_0 and f_1 is done here
+            print(f"RECURSION Level {level}, stream_coarse_step_ABC")
+            self.add_to_app(
+                app=app,
+                op_name="stream_coarse_step_ABC_no_254",
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+                omega=self.coalescence_factor,
+                timestep=0,
+            )
+            self.add_to_app(
+                app=app,
+                op_name="stream_coarse_step_254",
+                level=level,
+                f_0_fd=self.f_1,
+                f_1_fd=self.f_0,
+                bc_mask_fd=self.bc_mask,
+                missing_mask_fd=self.missing_mask,
+            )
+            return
+
         if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM:
             recursion_reference(self.count_levels - 1, app=self.app)
         elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST:
             recursion_fused_finest(self.count_levels - 1, app=self.app)
+        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254:
+            # Run kernel that generates teh 254 value in the bc_mask
+            wp.synchronize()
+            # self.bc_mask.update_host(0)
+            # wp.synchronize()
+            # self.bc_mask.export_vti(f"mask_before.vti", "u")
+
+            self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
+            wp.synchronize()
+            # self.bc_mask.update_host(0)
+            # wp.synchronize()
+            # self.bc_mask.export_vti(f"mask_after.vti", "u")
+            recursion_fused_finest_254(self.count_levels - 1, app=self.app)
+        elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254_ALL:
+            # Run kernel that generates teh 254 value in the bc_mask
+            wp.synchronize()
+            # self.bc_mask.update_host(0)
+            # wp.synchronize()
+            # self.bc_mask.export_vti(f"mask_before.vti", "u")
+
+            num_levels = self.f_0.get_grid().num_levels
+            for l in range(num_levels):
+                self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](l, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0)
+            # wp.synchronize()
+            # self.bc_mask.update_host(0)
+            wp.synchronize()
+            # self.bc_mask.export_vti(f"mask_after.vti", "u")
+            recursion_fused_finest_254_all(self.count_levels - 1, app=self.app)
         else:
             raise ValueError(f"Unknown optimization level: {self.opt_level}")
 
diff --git a/xlb/mres_perf_optimization_type.py b/xlb/mres_perf_optimization_type.py
index 622982ff..ae14bcf1 100644
--- a/xlb/mres_perf_optimization_type.py
+++ b/xlb/mres_perf_optimization_type.py
@@ -12,6 +12,8 @@ class MresPerfOptimizationType(Enum):
 
     NAIVE_COLLIDE_STREAM = auto()
     FUSION_AT_FINEST = auto()
+    FUSION_AT_FINEST_254 = auto()
+    FUSION_AT_FINEST_254_ALL = auto()
 
     @staticmethod
     def from_string(value: str) -> "MresPerfOptimizationType":
diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py
index 63ff560d..240da307 100644
--- a/xlb/operator/stepper/nse_multires_stepper.py
+++ b/xlb/operator/stepper/nse_multires_stepper.py
@@ -198,9 +198,9 @@ def compute(index: Any):
                                 if coalescence_factor > self.compute_dtype(0):
                                     coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor)
                                     wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor)
-
-                            else:
-                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+                            #
+                            # else:
+                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                 loader.declare_kernel(compute)
 
@@ -450,6 +450,135 @@ def device(index: Any):
 
             return ll_collide_coarse
 
+        @neon.Container.factory(name="collide_coarse_254")
+        def collide_coarse_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().num_levels
+
+            def ll_collide_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                # fake loading to enforce sequential step
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def device(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    """
+                    The c++ version starts with the following, which I am not sure is right:
+                        if (type(cell, 0) == CellType::bulk ) {
+                    BC type cells should do collide too
+                    """
+                    if _boundary_id != wp.uint8(254):
+                        return
+
+                    # Read thread data for populations, these are post streaming
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_stream = _f0_thread
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(device)
+
+            return ll_collide_coarse
+
+        @neon.Container.factory(name="no_254_collide_coarse")
+        def collide_coarse_no_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: int,
+        ):
+            num_levels = f_0_fd.get_grid().num_levels
+
+            def ll_collide_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                if level + 1 < f_0_fd.get_grid().num_levels:
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                # fake loading to enforce sequential step
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def device(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    """
+                    The c++ version starts with the following, which I am not sure is right:
+                        if (type(cell, 0) == CellType::bulk ) {
+                    BC type cells should do collide too
+                    """
+                    if _boundary_id == wp.uint8(255):
+                        return
+                    if _boundary_id == wp.uint8(254):
+                        return
+                    if not wp.neon_has_child(f_0_pn, index):
+                        # Read thread data for populations, these are post streaming
+                        _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                        _f_post_stream = _f0_thread
+
+                        _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                        _feq = self.equilibrium.neon_functional(_rho, _u)
+                        _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                        # Apply post-collision boundary conditions
+                        _f_post_collision = apply_bc(
+                            index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                        )
+
+                        # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                        neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
+                        # Accumulate the post-collision populations in f_0
+                        for l in range(self.velocity_set.q):
+                            push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                            if level < num_levels - 1:
+                                val = _f_post_collision[l]
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
+                                # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
+
+                            wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+                    else:
+                        for l in range(self.velocity_set.q):
+                            wp.neon_write(f_1_pn, index, l, self.compute_dtype(0))
+                            # Verified that this is not needed: wp.neon_write(f_0_pn, index, l, self.compute_dtype(0))
+
+                loader.declare_kernel(device)
+
+            return ll_collide_coarse
+
         @neon.Container.factory(name="stream_coarse_step_ABC")
         def stream_coarse_step_ABC(
             level: int,
@@ -550,8 +679,8 @@ def cl_stream_coarse(index: Any):
                                 accumulated = accumulated * coalescence_factor
                                 # wp.neon_write(f_1_pn, index, l, accumulated)
                                 _f_post_stream[l] = accumulated
-                            else:
-                                wp.print("ERRRRRRORRRRRRRRRRRRRR")
+                            # else:
+                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
                     # do non mres post-streaming corrections
                     _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
@@ -566,8 +695,8 @@ def cl_stream_coarse(index: Any):
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse_step_A")
-        def stream_coarse_step_A(
+        @neon.Container.factory(name="no_254_stream_coarse_step_ABC")
+        def stream_coarse_step_ABC_no_254(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
@@ -576,12 +705,15 @@ def stream_coarse_step_A(
             omega: Any,
             timestep: int,
         ):
-            num_levels = f_0_fd.get_grid().get_num_levels()
+            num_levels = f_0_fd.get_grid().num_levels
 
             # if level != 0:
             #     # throw an exception
             #     raise Exception("Only the finest level is supported for now")
 
+            # module op to define odd of even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
@@ -593,9 +725,15 @@ def ll_stream_coarse(loader: neon.Loader):
 
                 _c = self.velocity_set.c
 
+                coalescence_factor_fd = omega
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
+
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+
+                    if _boundary_id == wp.uint8(254):
+                        return
                     if _boundary_id == wp.uint8(255):
                         return
 
@@ -610,48 +748,119 @@ def cl_stream_coarse(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
+                    for l in range(self.velocity_set.q):
+                        if l == lattice_central_index:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                        # if (!pin.hasChildren(cell, dir)) {
+                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                            # NO finer ngh. in the pull direction (opposite of l)
+                            if not has_ngh_at_same_level:
+                                # NO ngh. at the same level
+                                # COULD we have a ngh. at the courser level?
+                                if wp.neon_has_parent(f_0_pn, index):
+                                    # YES halo cell on top of us
+                                    has_a_coarser_ngh = wp.bool(False)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                    if has_a_coarser_ngh:
+                                        # Full state:
+                                        # NO finer ngh. in the pull direction (opposite of l)
+                                        # NO ngh. at the same level
+                                        # YES ghost cell on top of us
+                                        # YES courser ngh.
+                                        # -> **Explosion**
+                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        _f_post_stream[l] = exploded_pop
+                        else:
+                            # HERE -> I have a finer ngh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer ngh.
+                            if has_ngh_at_same_level:
+                                # if l == 10:
+                                #     wp.print(accumulated)
+                                #     glob = wp.neon_global_idx(f_1_pn, index)
+                                #     wp.neon_cuda_info()
+                                #     wp.neon_print(glob)
+                                #     wp.neon_level(f_1_pn)
+                                # accumulated = _w[l]
+                                # Full State
+                                # YES finer ngh. in the pull direction (opposite of l)
+                                # YES ngh. at the same level
+                                # -> **Coalescence**
+                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
+                                accumulated = accumulated * coalescence_factor
+                                # wp.neon_write(f_1_pn, index, l, accumulated)
+                                _f_post_stream[l] = accumulated
+                            # else:
+                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
+
+                    # do non mres post-streaming corrections
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+
+                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
                     for l in range(self.velocity_set.q):
                         wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
-                    # wp.print("stream_coarse")
 
                 loader.declare_kernel(cl_stream_coarse)
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="stream_coarse_step_B")
-        def stream_coarse_step_B(
+        @neon.Container.factory(name="reset_bc_mask_for_no_mr_no_bc_as_254")
+        def reset_bc_mask_for_no_mr_no_bc_as_254(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
             bc_mask_fd: Any,
             missing_mask_fd: Any,
-            omega: Any,
-            timestep: int,
         ):
+            num_levels = f_0_fd.get_grid().num_levels
+
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
             def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
-                coalescence_factor_fd = omega
+
                 f_0_pn = loader.get_mres_read_handle(f_0_fd)
                 f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
-                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
 
                 _c = self.velocity_set.c
-                _w = self.velocity_set.w
 
                 @wp.func
                 def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
+                    if _boundary_id != 0:
+                        return
 
                     are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
                     if are_we_a_halo_cell:
                         # HERE: we are a halo cell so we just exit
                         return
 
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
                             # HERE, we skip the center direction
@@ -681,7 +890,8 @@ def cl_stream_coarse(index: Any):
                                         # YES ghost cell on top of us
                                         # YES courser ngh.
                                         # -> **Explosion**
-                                        wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                        return
                         else:
                             # HERE -> I have a finer ngh. in direction pull (opposite l)
                             # Then I have to read from the halo on top of my finer ngh.
@@ -697,59 +907,49 @@ def cl_stream_coarse(index: Any):
                                 # YES finer ngh. in the pull direction (opposite of l)
                                 # YES ngh. at the same level
                                 # -> **Coalescence**
-                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
-                                accumulated = accumulated * coalescence_factor
-                                wp.neon_write(f_1_pn, index, l, accumulated)
-
+                                return
                             else:
                                 wp.print("ERRRRRRORRRRRRRRRRRRRR")
 
+                    # Only fluid voxels with the following properties can reach this line:
+                    # They are not BC voxels
+                    # They are not on a resolution jump -> they do not do coalescence or explosion
+                    # They are not mr halo cells
+                    wp.neon_write(bc_mask_pn, index, 0, wp.uint8(254))
+
                 loader.declare_kernel(cl_stream_coarse)
 
             return ll_stream_coarse
 
-        @neon.Container.factory(name="finest_fused_pull")
-        def finest_fused_pull(
+        @neon.Container.factory(name="stream_coarse_step_A")
+        def stream_coarse_step_A(
             level: int,
             f_0_fd: Any,
             f_1_fd: Any,
             bc_mask_fd: Any,
             missing_mask_fd: Any,
             omega: Any,
-            timestep: Any,
-            is_f1_the_explosion_src_field: bool,
+            timestep: int,
         ):
-            if level != 0:
-                # throw an exception
-                raise Exception("Only the finest level is supported for now")
-            grid = f_0_fd.get_grid()
-            num_levels = grid.num_levels
+            num_levels = f_0_fd.get_grid().get_num_levels()
 
             # if level != 0:
             #     # throw an exception
             #     raise Exception("Only the finest level is supported for now")
 
-            # module op to define odd of even iteration
-            # od_or_even = wp.module("odd_or_even", "even")
-
-            def finest_fused_pull_launcher(loader: neon.Loader):
+            def ll_stream_coarse(loader: neon.Loader):
                 loader.set_mres_grid(bc_mask_fd.get_grid(), level)
 
-                if level + 1 < f_0_fd.get_grid().num_levels:
-                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
-                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
-                else:
-                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
-                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
 
                 bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
                 missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
 
                 _c = self.velocity_set.c
-                _w = self.velocity_set.w
 
                 @wp.func
-                def finest_fused_pull_kernel(index: Any):
+                def cl_stream_coarse(index: Any):
                     _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
                     if _boundary_id == wp.uint8(255):
                         return
@@ -765,6 +965,324 @@ def finest_fused_pull_kernel(index: Any):
                     _f_post_collision = _f0_thread
                     _f_post_stream = self.stream.neon_functional(f_0_pn, index)
 
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                    # wp.print("stream_coarse")
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="stream_coarse_step_254")
+        def stream_coarse_step_254(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any):
+
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+
+                @wp.func
+                def cl_stream_coarse(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(254):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_stream[l])
+                    # wp.print("stream_coarse")
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="stream_coarse_step_B")
+        def stream_coarse_step_B(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+        ):
+            def ll_stream_coarse(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+                coalescence_factor_fd = omega
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+                coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def cl_stream_coarse(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    for l in range(self.velocity_set.q):
+                        if l == lattice_central_index:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                        # if (!pin.hasChildren(cell, dir)) {
+                        if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction):
+                            # NO finer ngh. in the pull direction (opposite of l)
+                            if not has_ngh_at_same_level:
+                                # NO ngh. at the same level
+                                # COULD we have a ngh. at the courser level?
+                                if wp.neon_has_parent(f_0_pn, index):
+                                    # YES halo cell on top of us
+                                    has_a_coarser_ngh = wp.bool(False)
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                    if has_a_coarser_ngh:
+                                        # Full state:
+                                        # NO finer ngh. in the pull direction (opposite of l)
+                                        # NO ngh. at the same level
+                                        # YES ghost cell on top of us
+                                        # YES courser ngh.
+                                        # -> **Explosion**
+                                        wp.neon_write(f_1_pn, index, l, exploded_pop)
+                        else:
+                            # HERE -> I have a finer ngh. in direction pull (opposite l)
+                            # Then I have to read from the halo on top of my finer ngh.
+                            if has_ngh_at_same_level:
+                                # if l == 10:
+                                #     wp.print(accumulated)
+                                #     glob = wp.neon_global_idx(f_1_pn, index)
+                                #     wp.neon_cuda_info()
+                                #     wp.neon_print(glob)
+                                #     wp.neon_level(f_1_pn)
+                                # accumulated = _w[l]
+                                # Full State
+                                # YES finer ngh. in the pull direction (opposite of l)
+                                # YES ngh. at the same level
+                                # -> **Coalescence**
+                                coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l)
+                                accumulated = accumulated * coalescence_factor
+                                wp.neon_write(f_1_pn, index, l, accumulated)
+                            #
+                            # else:
+                            #     wp.print("ERRRRRRORRRRRRRRRRRRRR")
+
+                loader.declare_kernel(cl_stream_coarse)
+
+            return ll_stream_coarse
+
+        @neon.Container.factory(name="finest_fused_pull")
+        def finest_fused_pull(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: Any,
+            is_f1_the_explosion_src_field: bool,
+        ):
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+            grid = f_0_fd.get_grid()
+            num_levels = grid.num_levels
+
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
+            def finest_fused_pull_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                if level + 1 < f_0_fd.get_grid().num_levels:
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def finest_fused_pull_kernel(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    for l in range(self.velocity_set.q):
+                        if l == lattice_central_index:
+                            # HERE, we skip the center direction
+                            continue
+
+                        pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l]))
+
+                        has_ngh_at_same_level = wp.bool(False)
+                        accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level)
+
+                        # NO finer ngh. in the pull direction (opposite of l)
+                        if not has_ngh_at_same_level:
+                            # NO ngh. at the same level
+                            # COULD we have a ngh. at the courser level?
+                            if wp.neon_has_parent(f_0_pn, index):
+                                # YES halo cell on top of us
+                                has_a_coarser_ngh = wp.bool(False)
+                                if is_f1_the_explosion_src_field:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                else:
+                                    exploded_pop = wp.neon_lbm_read_coarser_ngh(
+                                        f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh
+                                    )
+                                if has_a_coarser_ngh:
+                                    # Full state:
+                                    # NO finer ngh. in the pull direction (opposite of l)
+                                    # NO ngh. at the same level
+                                    # YES ghost cell on top of us
+                                    # YES courser ngh.
+                                    # -> **Explosion**
+                                    # wp.neon_write(f_1_pn, index, l, exploded_pop)
+                                    _f_post_stream[l] = exploded_pop
+
+                    # do non mres post-streaming corrections
+                    _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True)
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    # Apply post-collision boundary conditions
+                    _f_post_collision = apply_bc(
+                        index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False
+                    )
+
+                    # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1
+                    neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn)
+
+                    # Accumulate the post-collision populations in f_0
+                    for l in range(self.velocity_set.q):
+                        push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l]))
+                        if level < num_levels - 1:
+                            val = _f_post_collision[l]
+                            if is_f1_the_explosion_src_field:
+                                wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val)
+                            else:
+                                wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val)
+
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(finest_fused_pull_kernel)
+
+            return finest_fused_pull_launcher
+
+        @neon.Container.factory(name="finest_fused_pull_no_254")
+        def finest_fused_pull_no_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+            timestep: Any,
+            is_f1_the_explosion_src_field: bool,
+        ):
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+            grid = f_0_fd.get_grid()
+            num_levels = grid.num_levels
+
+            # if level != 0:
+            #     # throw an exception
+            #     raise Exception("Only the finest level is supported for now")
+
+            # module op to define odd of even iteration
+            # od_or_even = wp.module("odd_or_even", "even")
+
+            def finest_fused_pull_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                if level + 1 < f_0_fd.get_grid().num_levels:
+                    f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up)
+                else:
+                    f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                    f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def finest_fused_pull_kernel(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id == wp.uint8(255):
+                        return
+                    if _boundary_id == wp.uint8(254):
+                        return
+
+                    are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index)
+                    if are_we_a_halo_cell:
+                        # HERE: we are a halo cell so we just exit
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
                     for l in range(self.velocity_set.q):
                         if l == lattice_central_index:
                             # HERE, we skip the center direction
@@ -831,6 +1349,60 @@ def finest_fused_pull_kernel(index: Any):
 
             return finest_fused_pull_launcher
 
+        @neon.Container.factory(name="254_finest_fused_pull")
+        def finest_fused_pull_254(
+            level: int,
+            f_0_fd: Any,
+            f_1_fd: Any,
+            bc_mask_fd: Any,
+            missing_mask_fd: Any,
+            omega: Any,
+        ):
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+            grid = f_0_fd.get_grid()
+            num_levels = grid.num_levels
+
+            if level != 0:
+                # throw an exception
+                raise Exception("Only the finest level is supported for now")
+
+            def finest_fused_pull_launcher(loader: neon.Loader):
+                loader.set_mres_grid(bc_mask_fd.get_grid(), level)
+
+                f_0_pn = loader.get_mres_read_handle(f_0_fd)
+                f_1_pn = loader.get_mres_write_handle(f_1_fd)
+
+                bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd)
+                missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd)
+
+                _c = self.velocity_set.c
+                _w = self.velocity_set.w
+
+                @wp.func
+                def finest_fused_pull_kernel_254(index: Any):
+                    _boundary_id = wp.neon_read(bc_mask_pn, index, 0)
+                    if _boundary_id != wp.uint8(254):
+                        return
+
+                    # do stream normally
+                    _missing_mask = _missing_mask_vec()
+                    _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index)
+                    _f_post_collision = _f0_thread
+                    _f_post_stream = self.stream.neon_functional(f_0_pn, index)
+
+                    _rho, _u = self.macroscopic.neon_functional(_f_post_stream)
+                    _feq = self.equilibrium.neon_functional(_rho, _u)
+                    _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega)
+
+                    for l in range(self.velocity_set.q):
+                        wp.neon_write(f_1_pn, index, l, _f_post_collision[l])
+
+                loader.declare_kernel(finest_fused_pull_kernel_254)
+
+            return finest_fused_pull_launcher
+
         @neon.Container.factory(name="stream_coarse_step_C")
         def stream_coarse_step_C(
             level: int,
@@ -889,29 +1461,89 @@ def cl_stream_coarse(index: Any):
             "stream_coarse_step_B": stream_coarse_step_B,
             "stream_coarse_step_C": stream_coarse_step_C,
             "finest_fused_pull": finest_fused_pull,
+            "finest_fused_pull_no_254": finest_fused_pull_no_254,
+            "finest_fused_pull_254": finest_fused_pull_254,
+            "reset_bc_mask_for_no_mr_no_bc_as_254": reset_bc_mask_for_no_mr_no_bc_as_254,
+            "collide_coarse_no_254": collide_coarse_no_254,
+            "collide_coarse_254": collide_coarse_254,
+            "stream_coarse_step_ABC_no_254": stream_coarse_step_ABC_no_254,
+            "stream_coarse_step_254": stream_coarse_step_254,
         }
 
     def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep):
         self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0)
 
-    def add_to_app(
-        self,
-        app,
-        op_name,
-        mres_level,
-        f_0,
-        f_1,
-        bc_mask,
-        missing_mask,
-        omega,
-        timestep,
-        is_f1_the_explosion_src_field: bool = None,
-    ):
+    def add_to_app(self, **kwargs):
+        import inspect
+
+        def validate_kwargs_forward(func, kwargs):
+            """
+            Check whether `func(**kwargs)` would be valid,
+            and return *all* the issues instead of raising on the first one.
+
+            Returns a dict; empty dict means "everything is OK".
+            """
+            sig = inspect.signature(func)
+            params = sig.parameters
+
+            errors = {}
+
+            # --- 1. Positional-only required params (cannot be given via kwargs) ---
+            pos_only_required = [name for name, p in params.items() if p.kind == inspect.Parameter.POSITIONAL_ONLY and p.default is inspect._empty]
+            if pos_only_required:
+                errors["positional_only_required"] = pos_only_required
+
+            # --- 2. Unexpected kwargs (if no **kwargs in target) ---
+            has_var_kw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())
+            if not has_var_kw:
+                allowed_kw = {
+                    name
+                    for name, p in params.items()
+                    if p.kind
+                    in (
+                        inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                        inspect.Parameter.KEYWORD_ONLY,
+                    )
+                }
+                unexpected = sorted(set(kwargs) - allowed_kw)
+                if unexpected:
+                    errors["unexpected_kwargs"] = unexpected
+
+            # --- 3. Missing required keyword-passable params ---
+            missing_required = [
+                name
+                for name, p in params.items()
+                if p.kind
+                in (
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                    inspect.Parameter.KEYWORD_ONLY,
+                )
+                and p.default is inspect._empty  # no default
+                and name not in kwargs  # not provided
+            ]
+            if missing_required:
+                errors["missing_required"] = missing_required
+
+            return errors
+
+        container_generator = None
+        try:
+            op_name = kwargs.pop("op_name")
+            app = kwargs.pop("app")
+        except:
+            raise ValueError("op_name and app must be provided as keyword arguments")
+
+        try:
+            container_generator = self.neon_container[op_name]
+        except KeyError:
+            raise ValueError(f"Operator {op_name} not found in neon container. Available operators: {list(self.neon_container.keys())}")
+
+        errors = validate_kwargs_forward(container_generator, kwargs)
+        if errors:
+            raise ValueError(f"Cannot forward kwargs to target: {errors}")
+
         nvtx.push_range(f"New Container {op_name}", color="yellow")
-        if is_f1_the_explosion_src_field is None:
-            app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep))
-        else:
-            app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep, is_f1_the_explosion_src_field))
+        app.append(container_generator(**kwargs))
         nvtx.pop_range()
 
     @Operator.register_backend(ComputeBackend.NEON)