diff --git a/examples/cfd/grid_refinement/ahmed.py b/examples/cfd/grid_refinement/ahmed.py index f6ef8b20..3b258870 100644 --- a/examples/cfd/grid_refinement/ahmed.py +++ b/examples/cfd/grid_refinement/ahmed.py @@ -56,7 +56,7 @@ def generate_cuboid_mesh(stl_filename, voxel_size): """ # Domain multipliers for each refinement level domain_multiplier = [ - [3.0, 4.0, 2.5, 2.5, 0.0, 4.0], # -x, x, -y, y, -z, z + [3.0, 4.0, 2.5, 2.5, 0.0, 4.0], # -x, x, -y, y, -z, z [1.2, 1.25, 1.75, 1.75, 0.0, 1.5], [0.8, 1.0, 1.25, 1.25, 0.0, 1.2], [0.5, 0.65, 0.6, 0.60, 0.0, 0.6], diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py index 5f9ddeb8..66945e89 100644 --- a/examples/performance/mlups_3d.py +++ b/examples/performance/mlups_3d.py @@ -16,31 +16,59 @@ def parse_arguments(): - parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)") - parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid") - parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation") - parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)") - parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)") + # Define valid options for consistency + COMPUTE_BACKENDS = ["neon", "warp", "jax"] + PRECISION_OPTIONS = ["fp32/fp32", "fp64/fp64", "fp64/fp32", "fp32/fp16"] + VELOCITY_SETS = ["D3Q19", "D3Q27"] + COLLISION_MODELS = ["BGK", "KBC"] + OCC_OPTIONS = ["standard", "none"] + + parser = argparse.ArgumentParser( + description="MLUPS Benchmark for 3D Lattice Boltzmann Method Simulation", + epilog=f""" +Examples: + %(prog)s 100 1000 neon fp32/fp32 + %(prog)s 200 500 neon fp64/fp64 --collision_model KBC --velocity_set D3Q27 + %(prog)s 150 2000 neon fp32/fp32 --gpu_devices=[0,1,2] --measure_scalability --report + %(prog)s 100 1000 neon fp32/fp32 --repetitions 5 --export_final_velocity + """, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Positional arguments + parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid (e.g., 100)") + parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation (e.g., 1000)") + parser.add_argument("compute_backend", type=str, choices=COMPUTE_BACKENDS, help=f"Backend for the simulation ({', '.join(COMPUTE_BACKENDS)})") + parser.add_argument("precision", type=str, choices=PRECISION_OPTIONS, help=f"Precision for the simulation ({', '.join(PRECISION_OPTIONS)})") + + # Optional arguments + parser.add_argument("--gpu_devices", type=str, default=None, help="CUDA devices to use for Neon backend (e.g., [0,1,2] or [0])") parser.add_argument( - "--gpu_devices", + "--velocity_set", type=str, - default=None, - help="List of the CUDA devices to use (e.g., --gpu_devices=[0,1,2]). This is only used for Neon backend.", + default="D3Q19", + choices=VELOCITY_SETS, + help=f"Lattice velocity set (default: D3Q19, choices: {', '.join(VELOCITY_SETS)})", ) - # add a flat to choose between 19 or 27 velocity set - parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)") - # add a flat to choose between multi-gpu occ options based on the neon occ: parser.add_argument( - "--occ", type=str, default="standard", help="Overlapping Communication and Computation option (standard, none) (default: standard)" + "--collision_model", + type=str, + default="BGK", + choices=COLLISION_MODELS, + help=f"Collision model (default: BGK, choices: {', '.join(COLLISION_MODELS)}, KBC requires D3Q27)", ) - parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)") - parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)") - parser.add_argument("--measure_scalability", action="store_true", help="Measure scalability of the simulation (default: disabled)") parser.add_argument( - "--repetitions", - type=int, - default=1, - help="Number of repetitions for the simulation (default: 1) to get the average MLUPs and standard deviation", + "--occ", + type=str, + default="standard", + choices=OCC_OPTIONS, + help=f"Overlapping Communication and Computation strategy (default: standard, choices: {', '.join(OCC_OPTIONS)})", + ) + parser.add_argument("--report", action="store_true", help="Generate Neon performance report") + parser.add_argument("--export_final_velocity", action="store_true", help="Export final velocity field to VTI file") + parser.add_argument("--measure_scalability", action="store_true", help="Measure performance across different GPU counts") + parser.add_argument( + "--repetitions", type=int, default=1, metavar="N", help="Number of simulation repetitions for statistical analysis (default: 1)" ) args = parser.parse_args() @@ -56,29 +84,32 @@ def parse_arguments(): except (ValueError, SyntaxError): raise ValueError("Invalid gpu_devices format. Use format like [0,1,2] or [0]") - # Checking the compute backend and covert it to the right type - compute_backend = None - if args.compute_backend == "jax": - compute_backend = ComputeBackend.JAX - elif args.compute_backend == "warp": - compute_backend = ComputeBackend.WARP - elif args.compute_backend == "neon": - compute_backend = ComputeBackend.NEON - else: - raise ValueError("Invalid compute backend specified. Use 'jax', 'warp', or 'neon'.") + # Validate and convert compute backend + compute_backend_map = { + "jax": ComputeBackend.JAX, + "warp": ComputeBackend.WARP, + "neon": ComputeBackend.NEON, + } + compute_backend = compute_backend_map.get(args.compute_backend) + if compute_backend is None: + raise ValueError(f"Invalid compute backend '{args.compute_backend}'. Use: {', '.join(COMPUTE_BACKENDS)}") args.compute_backend = compute_backend - # Checking OCC - if args.occ not in ["standard", "none"]: - raise ValueError("Invalid occupancy option. Use 'standard', or 'none'.") - if args.gpu_devices is None and args.compute_backend == ComputeBackend.NEON: - print("[Warning] No GPU devices specified. Using default device 0.") - args.gpu_devices = [0] + # Handle GPU devices for Neon backend if args.compute_backend == ComputeBackend.NEON: + if args.gpu_devices is None: + print("[INFO] No GPU devices specified. Using default device 0.") + args.gpu_devices = [0] + import neon - occ = neon.SkeletonConfig.OCC.from_string(args.occ) - args.occ = occ + occ_enum = neon.SkeletonConfig.OCC.from_string(args.occ) + args.occ_enum = occ_enum # Store the enum for Neon + args.occ_display = args.occ # Store the original string for display + else: + if args.gpu_devices is not None: + raise ValueError(f"--gpu_devices can only be used with Neon backend, not {args.compute_backend.name}") + args.gpu_devices = [0] # Default for non-Neon backends # Checking precision policy precision_policy_map = { @@ -89,12 +120,12 @@ def parse_arguments(): } precision_policy = precision_policy_map.get(args.precision) if precision_policy is None: - raise ValueError("Invalid precision specified.") + raise ValueError(f"Invalid precision '{args.precision}'. Use: {', '.join(PRECISION_OPTIONS)}") args.precision_policy = precision_policy - # Checking velocity set - if args.velocity_set not in ["D3Q19", "D3Q27"]: - raise ValueError("Invalid velocity set. Use 'D3Q19' or 'D3Q27'.") + # Validate collision model and velocity set compatibility + if args.collision_model == "KBC" and args.velocity_set != "D3Q27": + raise ValueError("KBC collision model requires D3Q27 velocity set. Use --velocity_set D3Q27") if args.velocity_set == "D3Q19": velocity_set = xlb.velocity_set.D3Q19(precision_policy=args.precision_policy, compute_backend=compute_backend) @@ -102,42 +133,45 @@ def parse_arguments(): velocity_set = xlb.velocity_set.D3Q27(precision_policy=args.precision_policy, compute_backend=compute_backend) args.velocity_set = velocity_set - if args.gpu_devices is not None and args.compute_backend != ComputeBackend.NEON: - raise ValueError("--gpu_devices can be used only with the Neon backend.") - - if args.gpu_devices is None: - args.gpu_devices = [0] - print_args(args) return args def print_args(args): - # Print simulation configuration - print("=" * 60) - print(" 3D LATTICE BOLTZMANN SIMULATION CONFIG") - print("=" * 60) - print(f"Grid Size: {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})") - print(f"Total Lattice Points: {args.cube_edge**3:,}") - print(f"Time Steps: {args.num_steps:,}") - print(f"Compute Backend: {args.compute_backend.name}") - print(f"Precision Policy: {args.precision}") - print(f"Velocity Set: {args.velocity_set.__class__.__name__}") - print(f"Generate Report: {'Yes' if args.report else 'No'}") - print(f"Measure Scalability: {'Yes' if args.measure_scalability else 'No'}") - print(f"Export Velocity: {'Yes' if args.export_final_velocity else 'No'}") - print(f"Repetitions: {args.repetitions}") + """Print simulation configuration in a clean, organized format""" + print("\n" + "=" * 70) + print(" SIMULATION CONFIGURATION") + print("=" * 70) + + # Grid and simulation parameters + print("GRID & SIMULATION:") + print(f" Grid Size: {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})") + print(f" Total Lattice Points: {args.cube_edge**3:,}") + print(f" Time Steps: {args.num_steps:,}") + print(f" Repetitions: {args.repetitions}") + + # Computational settings + print("\nCOMPUTATIONAL SETTINGS:") + print(f" Compute Backend: {args.compute_backend.name}") + print(f" Precision Policy: {args.precision}") + print(f" Velocity Set: {args.velocity_set.__class__.__name__}") + print(f" Collision Model: {args.collision_model}") + # Backend-specific settings if args.compute_backend.name == "NEON": - print(f"GPU Devices: {args.gpu_devices}") - # Convert the neon OCC enum back to string for display - occ_display = args.occ.to_string() if hasattr(args.occ, "__class__") else args.occ - print(f"OCC Strategy: {occ_display}") + print("\nNEON BACKEND SETTINGS:") + print(f" GPU Devices: {args.gpu_devices}") + print(f" OCC Strategy: {args.occ_display}") - print("=" * 60) - print("Starting simulation...") - print() + # Output options + print("\nOUTPUT OPTIONS:") + print(f" Generate Report: {'Yes' if args.report else 'No'}") + print(f" Measure Scalability: {'Yes' if args.measure_scalability else 'No'}") + print(f" Export Velocity: {'Yes' if args.export_final_velocity else 'No'}") + + print("=" * 70) + print("Starting simulation...\n") def init_xlb(args): @@ -148,12 +182,14 @@ def init_xlb(args): ) options = None if args.compute_backend == ComputeBackend.NEON: - neon_options = {"occ": args.occ, "device_list": args.gpu_devices} + neon_options = {"occ": args.occ_enum, "device_list": args.gpu_devices} options = neon_options return args.compute_backend, args.precision_policy, options -def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices): +def run_simulation( + compute_backend, precision_policy, grid_shape, num_steps, options, export_final_velocity, repetitions, num_devices, collision_model +): grid = grid_factory(grid_shape, backend_config=options) box = grid.bounding_box_indices() box_no_edge = grid.bounding_box_indices(remove_edges=True) @@ -170,7 +206,7 @@ def run_simulation(compute_backend, precision_policy, grid_shape, num_steps, opt stepper = IncompressibleNavierStokesStepper( grid=grid, boundary_conditions=boundary_conditions, - collision_type="BGK", + collision_type=collision_model, backend_config=options, ) @@ -232,50 +268,6 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time): return mlups -def print_summary(args, elapsed_time, mlups): - """Print comprehensive simulation summary with parameters and performance results""" - total_lattice_points = args.cube_edge**3 - total_lattice_updates = total_lattice_points * args.num_steps - lattice_points_per_second = total_lattice_updates / elapsed_time - - print("\n\n\n" + "=" * 70) - print(" SIMULATION SUMMARY") - print("=" * 70) - - # Simulation Parameters - print("SIMULATION PARAMETERS:") - print("-" * 25) - print(f" Grid Size: {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})") - print(f" Total Lattice Points: {total_lattice_points:,}") - print(f" Time Steps: {args.num_steps:,}") - print(f" Total Lattice Updates: {total_lattice_updates:,}") - print(f" Compute Backend: {args.compute_backend.name}") - print(f" Precision Policy: {args.precision}") - print(f" Velocity Set: {args.velocity_set.__class__.__name__}") - print(f" Generate Report: {'Yes' if args.report else 'No'}") - print(f" Measure Scalability: {'Yes' if args.measure_scalability else 'No'}") - - if args.compute_backend.name == "NEON": - print(f" GPU Devices: {args.gpu_devices}") - occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ - print(f" OCC Strategy: {occ_display}") - - print() - - # Performance Results - print("PERFORMANCE RESULTS:") - print("-" * 20) - print(f" Time in main loop: {elapsed_time:.3f} seconds") - print(f" MLUPs: {mlups:.2f}") - print(f" Time per LBM step: {elapsed_time / args.num_steps * 1000:.3f} ms") - - if args.compute_backend.name == "NEON" and len(args.gpu_devices) > 1: - mlups_per_gpu = mlups / len(args.gpu_devices) - print(f" MLUPs per GPU: {mlups_per_gpu:.2f}") - - print("=" * 70) - - def print_summary_with_stats(args, stats): """Print comprehensive simulation summary with statistics from multiple repetitions""" total_lattice_points = args.cube_edge**3 @@ -301,12 +293,13 @@ def print_summary_with_stats(args, stats): print(f" Compute Backend: {args.compute_backend.name}") print(f" Precision Policy: {args.precision}") print(f" Velocity Set: {args.velocity_set.__class__.__name__}") + print(f" Collision Model: {args.collision_model}") print(f" Generate Report: {'Yes' if args.report else 'No'}") print(f" Measure Scalability: {'Yes' if args.measure_scalability else 'No'}") if args.compute_backend.name == "NEON": print(f" GPU Devices: {args.gpu_devices}") - occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ + occ_display = args.occ_display print(f" OCC Strategy: {occ_display}") print() @@ -371,9 +364,10 @@ def print_scalability_summary(args, stats_list): print(f" Compute Backend: {args.compute_backend.name}") print(f" Precision Policy: {args.precision}") print(f" Velocity Set: {args.velocity_set.__class__.__name__}") + print(f" Collision Model: {args.collision_model}") if args.compute_backend.name == "NEON": - occ_display = str(args.occ).split(".")[-1] if hasattr(args.occ, "__class__") else args.occ + occ_display = args.occ_display print(f" OCC Strategy: {occ_display}") print(f" Available GPU Devices: {args.gpu_devices}") @@ -439,11 +433,18 @@ def print_scalability_summary(args, stats_list): def report(args, stats): import neon + import sys report = neon.Report("LBM MLUPS LDC") + + # Save the full command line + command_line = " ".join(sys.argv) + report.add_member("command_line", command_line) + report.add_member("velocity_set", args.velocity_set.__class__.__name__) report.add_member("compute_backend", args.compute_backend.name) report.add_member("precision_policy", args.precision) + report.add_member("collision_model", args.collision_model) report.add_member("grid_size", args.cube_edge) report.add_member("num_steps", args.num_steps) report.add_member("repetitions", args.repetitions) @@ -463,16 +464,27 @@ def report(args, stats): report.add_member("elapsed_time", stats["mean_elapsed_time"]) report.add_member("mlups", stats["mean_mlups"]) - report.add_member("occ", (args.occ.to_string())) + report.add_member("occ", args.occ_display) report.add_member_vector("gpu_devices", args.gpu_devices) report.add_member("num_devices", len(args.gpu_devices)) report.add_member("measure_scalability", args.measure_scalability) - report_name = "mlups_3d_" + f"size_{args.cube_edge}" - if args.measure_scalability: - report_name += f"_dev_{len(args.gpu_devices)}" + # Generate report name following the convention: script_name + parameters + report_name = "mlups_3d" + report_name += f"_velocity_set_{args.velocity_set.__class__.__name__}" + report_name += f"_compute_backend_{args.compute_backend.name}" + report_name += f"_precision_policy_{args.precision.replace('/', '_')}" + report_name += f"_collision_model_{args.collision_model}" + report_name += f"_grid_size_{args.cube_edge}" + report_name += f"_num_steps_{args.num_steps}" + + if args.compute_backend.name == "NEON": + report_name += f"_occ_{args.occ_display}" + report_name += f"_num_devices_{len(args.gpu_devices)}" + if args.repetitions > 1: - report_name += f"_rep_{args.repetitions}" + report_name += f"_repetitions_{args.repetitions}" + report.write(report_name, True) @@ -494,6 +506,7 @@ def benchmark(args): export_final_velocity=args.export_final_velocity, repetitions=args.repetitions, num_devices=len(args.gpu_devices), + collision_model=args.collision_model, ) for elapsed_time in elapsed_time_list: diff --git a/examples/performance/mlups_3d_multires.py b/examples/performance/mlups_3d_multires.py index cff15eb1..3a143764 100644 --- a/examples/performance/mlups_3d_multires.py +++ b/examples/performance/mlups_3d_multires.py @@ -10,21 +10,97 @@ from xlb.grid import multires_grid_factory from xlb.operator.stepper import MultiresIncompressibleNavierStokesStepper from xlb.operator.boundary_condition import FullwayBounceBackBC, EquilibriumBC +from xlb.mres_perf_optimization_type import MresPerfOptimizationType def parse_arguments(): - parser = argparse.ArgumentParser(description="MLUPS for 3D Lattice Boltzmann Method Simulation (BGK)") + parser = argparse.ArgumentParser( + description="MLUPS for 3D Lattice Boltzmann Method Simulation with Multi-resolution Grid", + epilog=""" +Examples: + %(prog)s 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM + %(prog)s 200 500 neon fp64/fp64 3 FUSION_AT_FINEST --report + %(prog)s 50 2000 neon fp32/fp16 2 NAIVE_COLLIDE_STREAM --export_final_velocity + +Valid values: + compute_backend: neon + precision: fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16 + mres_perf_opt: NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST + velocity_set: D3Q19, D3Q27 + collision_model: BGK, KBC + """, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + # Positional arguments - parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid") - parser.add_argument("num_steps", type=int, help="Timestep for the simulation") - parser.add_argument("compute_backend", type=str, help="Backend for the simulation (jax, warp or neon)") - parser.add_argument("precision", type=str, help="Precision for the simulation (e.g., fp32/fp32)") + parser.add_argument("cube_edge", type=int, help="Length of the edge of the cubic grid (e.g., 100)") + parser.add_argument("num_steps", type=int, help="Number of timesteps for the simulation (e.g., 1000)") + parser.add_argument("compute_backend", type=str, help="Backend for the simulation (neon)") + parser.add_argument("precision", type=str, help="Precision for the simulation (fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16)") + parser.add_argument("num_levels", type=int, help="Number of levels for the multiresolution grid (e.g., 2)") + parser.add_argument( + "mres_perf_opt", + type=MresPerfOptimizationType.from_string, + help="Multi-resolution performance optimization strategy (NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST)", + ) # Optional arguments parser.add_argument("--num_devices", type=int, default=0, help="Number of devices for the simulation (default: 0)") parser.add_argument("--velocity_set", type=str, default="D3Q19", help="Lattice type: D3Q19 or D3Q27 (default: D3Q19)") + parser.add_argument("--collision_model", type=str, default="BGK", help="Collision model: BGK or KBC (default: BGK)") + + parser.add_argument("--report", action="store_true", help="Generate a neon report file (default: disabled)") + parser.add_argument("--export_final_velocity", action="store_true", help="Export the final velocity field to a vti file (default: disabled)") + + try: + args = parser.parse_args() + except SystemExit: + # Re-raise with custom message + print("\n" + "=" * 60) + print("USAGE EXAMPLES:") + print("=" * 60) + print("python mlups_3d_multires.py 100 1000 neon fp32/fp32 2 NAIVE_COLLIDE_STREAM") + print("python mlups_3d_multires.py 200 500 neon fp64/fp64 3 FUSION_AT_FINEST --report") + print("\nVALID VALUES:") + print(" compute_backend: neon") + print(" precision: fp32/fp32, fp64/fp64, fp64/fp32, fp32/fp16") + print(" mres_perf_opt: NAIVE_COLLIDE_STREAM, FUSION_AT_FINEST") + print(" velocity_set: D3Q19, D3Q27") + print(" collision_model: BGK, KBC") + print("=" * 60) + raise + + print_args(args) + + if args.compute_backend != "neon": + raise ValueError("Invalid compute backend specified. Use 'neon' which supports multi-resolution!") + + if args.collision_model not in ["BGK", "KBC"]: + raise ValueError("Invalid collision model specified. Use 'BGK' or 'KBC'.") + + return args - return parser.parse_args() + +def print_args(args): + # Print simulation configuration + print("=" * 60) + print(" 3D LATTICE BOLTZMANN SIMULATION CONFIG") + print("=" * 60) + print(f"Grid Size: {args.cube_edge}³ ({args.cube_edge:,} × {args.cube_edge:,} × {args.cube_edge:,})") + print(f"Total Lattice Points: {args.cube_edge**3:,}") + print(f"Time Steps: {args.num_steps:,}") + print(f"Number Levels: {args.num_levels}") + print(f"Compute Backend: {args.compute_backend}") + print(f"Precision Policy: {args.precision}") + print(f"Velocity Set: {args.velocity_set}") + print(f"Collision Model: {args.collision_model}") + print(f"Mres Perf Opt: {args.mres_perf_opt}") + print(f"Generate Report: {'Yes' if args.report else 'No'}") + print(f"Export Velocity: {'Yes' if args.export_final_velocity else 'No'}") + + print("=" * 60) + print("Starting simulation...") + print() def setup_simulation(args): @@ -61,7 +137,7 @@ def setup_simulation(args): return velocity_set -def problem1(grid_shape, velocity_set): +def problem1(grid_shape, velocity_set, num_levels): def peel(dim, idx, peel_level, outwards): if outwards: xIn = idx.x <= peel_level or idx.x >= dim.x - 1 - peel_level @@ -108,7 +184,6 @@ def get_levels(num_levels): levels.append(lastLevel) return levels - num_levels = 4 levels = get_levels(num_levels) grid = multires_grid_factory( @@ -129,9 +204,8 @@ def get_levels(num_levels): return grid, lid, walls -def problem2(grid_shape, velocity_set): +def problem2(grid_shape, velocity_set, num_levels): # Example 2: Coarsest at the edges (2 level only) - num_levels = 4 level_origins = [] level_list = [] for lvl in range(num_levels): @@ -167,7 +241,15 @@ def problem2(grid_shape, velocity_set): return grid, lid, walls -def run(velocity_set, grid_shape, num_steps): +def run( + velocity_set, + grid_shape, + num_steps, + num_levels, + collision_model, + export_final_velocity, + mres_perf_opt, +): # Create grid and setup boundary conditions # Convert indices to list of indices per level @@ -177,10 +259,10 @@ def run(velocity_set, grid_shape, num_steps): # walls = construct_indices_per_level(grid_shape, walls, levels_mask, level_origins) # Example 1: fine to coarse - # grid, lid, walls = problem1(grid_shape, velocity_set) + # grid, lid, walls = problem1(grid_shape, velocity_set, num_levels) # Example 2: Coarse to fine: - grid, lid, walls = problem2(grid_shape, velocity_set) + grid, lid, walls = problem1(grid_shape, velocity_set, num_levels) prescribed_vel = 0.1 boundary_conditions = [ @@ -195,7 +277,13 @@ def run(velocity_set, grid_shape, num_steps): omega_finest = 1.0 / (3.0 * visc + 0.5) # Define a multi-resolution simulation manager - sim = xlb.helper.MultiresSimulationManager(omega_finest=omega_finest, grid=grid, boundary_conditions=boundary_conditions, collision_type="KBC") + sim = xlb.helper.MultiresSimulationManager( + omega_finest=omega_finest, + grid=grid, + boundary_conditions=boundary_conditions, + collision_type=collision_model, + mres_perf_opt=mres_perf_opt, + ) # sim.export_macroscopic("Initial_") # sim.step() @@ -203,6 +291,10 @@ def run(velocity_set, grid_shape, num_steps): print("start timing") wp.synchronize() start_time = time.time() + + if num_levels == 1: + num_steps = num_steps // 2 + for i in range(num_steps): sim.step() # if i % 1000 == 0: @@ -212,6 +304,9 @@ def run(velocity_set, grid_shape, num_steps): t = time.time() - start_time print(f"Timing {t}") + if export_final_velocity: + sim.export_macroscopic("u_lid_driven_cavity_") + # sim.export_macroscopic("u_lid_driven_cavity_") num_levels = grid.count_levels return {"time": t, "num_levels": num_levels} @@ -236,11 +331,42 @@ def calculate_mlups(cube_edge, num_steps, elapsed_time, num_levels): # save_image(fields["u_magnitude"][:, ny//2, :], timestep=i, prefix="lid_driven_cavity") +def generate_report(args, stats, mlups_stats): + """Generate a neon report file with simulation parameters and results""" + import neon + import sys + + report = neon.Report("LBM MLUPS Multiresolution LDC") + + # Save the full command line + command_line = " ".join(sys.argv) + report.add_member("command_line", command_line) + + report.add_member("velocity_set", args.velocity_set) + report.add_member("compute_backend", args.compute_backend) + report.add_member("precision_policy", args.precision) + report.add_member("collision_model", args.collision_model) + report.add_member("grid_size", args.cube_edge) + report.add_member("num_steps", args.num_steps) + report.add_member("num_levels", stats["num_levels"]) + report.add_member("finer_steps", mlups_stats["finer_steps"]) + + # Performance metrics + report.add_member("elapsed_time", stats["time"]) + report.add_member("emlups", mlups_stats["EMLUPS"]) + + report_name = f"mlups_3d_multires_size_{args.cube_edge}_levels_{stats['num_levels']}" + report.write(report_name, True) + print("Report generated successfully.") + + def main(): args = parse_arguments() velocity_set = setup_simulation(args) grid_shape = (args.cube_edge, args.cube_edge, args.cube_edge) - stats = run(velocity_set, grid_shape, args.num_steps) + stats = run( + velocity_set, grid_shape, args.num_steps, args.num_levels, args.collision_model, args.export_final_velocity, mres_perf_opt=args.mres_perf_opt + ) mlups_stats = calculate_mlups(args.cube_edge, args.num_steps, stats["time"], stats["num_levels"]) print(f"Simulation completed in {stats['time']:.2f} seconds") @@ -252,6 +378,10 @@ def main(): EMLUPS = mlups_stats["EMLUPS"] print(f"EMLUPs: {EMLUPS:.2f}") + # Generate report if requested + if args.report: + generate_report(args, stats, mlups_stats) + if __name__ == "__main__": main() diff --git a/xlb/helper/simulation_manager.py b/xlb/helper/simulation_manager.py index 8abc56b6..c1c43a10 100644 --- a/xlb/helper/simulation_manager.py +++ b/xlb/helper/simulation_manager.py @@ -105,7 +105,7 @@ def recursion_reference(level, app): self.add_to_app( app=app, op_name="collide_coarse", - mres_level=level, + level=level, f_0=self.f_0, f_1=self.f_1, bc_mask=self.bc_mask, @@ -122,7 +122,7 @@ def recursion_reference(level, app): self.add_to_app( app=app, op_name="stream_coarse_step_ABC", - mres_level=level, + level=level, f_0=self.f_1, f_1=self.f_0, bc_mask=self.bc_mask, @@ -144,11 +144,11 @@ def recursion_fused_finest(level, app): self.add_to_app( app=app, op_name="finest_fused_pull", - mres_level=level, - f_0=self.f_0, - f_1=self.f_1, - bc_mask=self.bc_mask, - missing_mask=self.missing_mask, + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, omega=omega, timestep=0, is_f1_the_explosion_src_field=True, @@ -156,11 +156,11 @@ def recursion_fused_finest(level, app): self.add_to_app( app=app, op_name="finest_fused_pull", - mres_level=level, - f_0=self.f_1, - f_1=self.f_0, - bc_mask=self.bc_mask, - missing_mask=self.missing_mask, + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, omega=omega, timestep=0, is_f1_the_explosion_src_field=False, @@ -173,11 +173,11 @@ def recursion_fused_finest(level, app): self.add_to_app( app=app, op_name="collide_coarse", - mres_level=level, - f_0=self.f_0, - f_1=self.f_1, - bc_mask=self.bc_mask, - missing_mask=self.missing_mask, + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, omega=omega, timestep=0, ) @@ -196,19 +196,256 @@ def recursion_fused_finest(level, app): self.add_to_app( app=app, op_name="stream_coarse_step_ABC", - mres_level=level, - f_0=self.f_1, - f_1=self.f_0, - bc_mask=self.bc_mask, - missing_mask=self.missing_mask, + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=self.coalescence_factor, + timestep=0, + ) + + def recursion_fused_finest_254(level, app): + if level < 0: + return + + # Compute omega at the current level + omega = self.omega_list[level] + + if level == 0: + print(f"RECURSION down to the finest level {level}") + print(f"RECURSION Level {level}, Fused STREAM and COLLIDE") + self.add_to_app( + app=app, + op_name="finest_fused_pull_no_254", + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + timestep=0, + is_f1_the_explosion_src_field=True, + ) + self.add_to_app( + app=app, + op_name="finest_fused_pull_254", + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + ) + self.add_to_app( + app=app, + op_name="finest_fused_pull_no_254", + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + timestep=0, + is_f1_the_explosion_src_field=False, + ) + self.add_to_app( + app=app, + op_name="finest_fused_pull_254", + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + ) + return + + print(f"RECURSION down to level {level}") + print(f"RECURSION Level {level}, COLLIDE") + + self.add_to_app( + app=app, + op_name="collide_coarse", + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + timestep=0, + ) + # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1. + # so is_self_f1_the_coalescence_dst_field is True + # 2. Explision data is the output from the corser collide, which is f_1=self.f_1. + # so is_self_f1_the_explosion_src_field is True + + if level - 1 == 0: + recursion_fused_finest_254(level - 1, app) + else: + recursion_fused_finest_254(level - 1, app) + recursion_fused_finest_254(level - 1, app) + # Important: swapping of f_0 and f_1 is done here + print(f"RECURSION Level {level}, stream_coarse_step_ABC") + self.add_to_app( + app=app, + op_name="stream_coarse_step_ABC", + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, omega=self.coalescence_factor, timestep=0, ) + def recursion_fused_finest_254_all(level, app): + if level < 0: + return + + # Compute omega at the current level + omega = self.omega_list[level] + + if level == 0: + print(f"RECURSION down to the finest level {level}") + print(f"RECURSION Level {level}, Fused STREAM and COLLIDE") + self.add_to_app( + app=app, + op_name="finest_fused_pull_no_254", + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + timestep=0, + is_f1_the_explosion_src_field=True, + ) + self.add_to_app( + app=app, + op_name="finest_fused_pull_254", + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + ) + self.add_to_app( + app=app, + op_name="finest_fused_pull_no_254", + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + timestep=0, + is_f1_the_explosion_src_field=False, + ) + self.add_to_app( + app=app, + op_name="finest_fused_pull_254", + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + ) + return + + print(f"RECURSION down to level {level}") + print(f"RECURSION Level {level}, COLLIDE") + + self.add_to_app( + app=app, + op_name="collide_coarse_no_254", + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + timestep=0, + ) + self.add_to_app( + app=app, + op_name="collide_coarse_254", + level=level, + f_0_fd=self.f_0, + f_1_fd=self.f_1, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=omega, + timestep=0, + ) + # 1. Accumulation is read from f_0 in the streaming step, where f_0=self.f_1. + # so is_self_f1_the_coalescence_dst_field is True + # 2. Explision data is the output from the corser collide, which is f_1=self.f_1. + # so is_self_f1_the_explosion_src_field is True + + if level - 1 == 0: + recursion_fused_finest_254_all(level - 1, app) + else: + recursion_fused_finest_254_all(level - 1, app) + recursion_fused_finest_254_all(level - 1, app) + # Important: swapping of f_0 and f_1 is done here + print(f"RECURSION Level {level}, stream_coarse_step_ABC") + self.add_to_app( + app=app, + op_name="stream_coarse_step_ABC_no_254", + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + omega=self.coalescence_factor, + timestep=0, + ) + self.add_to_app( + app=app, + op_name="stream_coarse_step_254", + level=level, + f_0_fd=self.f_1, + f_1_fd=self.f_0, + bc_mask_fd=self.bc_mask, + missing_mask_fd=self.missing_mask, + ) + return + if self.mres_perf_opt == MresPerfOptimizationType.NAIVE_COLLIDE_STREAM: recursion_reference(self.count_levels - 1, app=self.app) elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST: recursion_fused_finest(self.count_levels - 1, app=self.app) + elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254: + # Run kernel that generates teh 254 value in the bc_mask + wp.synchronize() + # self.bc_mask.update_host(0) + # wp.synchronize() + # self.bc_mask.export_vti(f"mask_before.vti", "u") + + self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](0, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0) + wp.synchronize() + # self.bc_mask.update_host(0) + # wp.synchronize() + # self.bc_mask.export_vti(f"mask_after.vti", "u") + recursion_fused_finest_254(self.count_levels - 1, app=self.app) + elif self.mres_perf_opt == MresPerfOptimizationType.FUSION_AT_FINEST_254_ALL: + # Run kernel that generates teh 254 value in the bc_mask + wp.synchronize() + # self.bc_mask.update_host(0) + # wp.synchronize() + # self.bc_mask.export_vti(f"mask_before.vti", "u") + + num_levels = self.f_0.get_grid().num_levels + for l in range(num_levels): + self.neon_container["reset_bc_mask_for_no_mr_no_bc_as_254"](l, self.f_0, self.f_1, self.bc_mask, self.bc_mask).run(0) + # wp.synchronize() + # self.bc_mask.update_host(0) + wp.synchronize() + # self.bc_mask.export_vti(f"mask_after.vti", "u") + recursion_fused_finest_254_all(self.count_levels - 1, app=self.app) else: raise ValueError(f"Unknown optimization level: {self.opt_level}") diff --git a/xlb/mres_perf_optimization_type.py b/xlb/mres_perf_optimization_type.py index 622982ff..ae14bcf1 100644 --- a/xlb/mres_perf_optimization_type.py +++ b/xlb/mres_perf_optimization_type.py @@ -12,6 +12,8 @@ class MresPerfOptimizationType(Enum): NAIVE_COLLIDE_STREAM = auto() FUSION_AT_FINEST = auto() + FUSION_AT_FINEST_254 = auto() + FUSION_AT_FINEST_254_ALL = auto() @staticmethod def from_string(value: str) -> "MresPerfOptimizationType": diff --git a/xlb/operator/stepper/nse_multires_stepper.py b/xlb/operator/stepper/nse_multires_stepper.py index 63ff560d..240da307 100644 --- a/xlb/operator/stepper/nse_multires_stepper.py +++ b/xlb/operator/stepper/nse_multires_stepper.py @@ -198,9 +198,9 @@ def compute(index: Any): if coalescence_factor > self.compute_dtype(0): coalescence_factor = self.compute_dtype(1) / (self.compute_dtype(2) * coalescence_factor) wp.neon_write(coalescence_factor_pn, index, l, coalescence_factor) - - else: - wp.print("ERRRRRRORRRRRRRRRRRRRR") + # + # else: + # wp.print("ERRRRRRORRRRRRRRRRRRRR") loader.declare_kernel(compute) @@ -450,6 +450,135 @@ def device(index: Any): return ll_collide_coarse + @neon.Container.factory(name="collide_coarse_254") + def collide_coarse_254( + level: int, + f_0_fd: Any, + f_1_fd: Any, + bc_mask_fd: Any, + missing_mask_fd: Any, + omega: Any, + timestep: int, + ): + num_levels = f_0_fd.get_grid().num_levels + + def ll_collide_coarse(loader: neon.Loader): + loader.set_mres_grid(bc_mask_fd.get_grid(), level) + + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) + + # fake loading to enforce sequential step + + bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) + missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) + + _c = self.velocity_set.c + _w = self.velocity_set.w + + @wp.func + def device(index: Any): + _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + """ + The c++ version starts with the following, which I am not sure is right: + if (type(cell, 0) == CellType::bulk ) { + BC type cells should do collide too + """ + if _boundary_id != wp.uint8(254): + return + + # Read thread data for populations, these are post streaming + _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index) + _f_post_stream = _f0_thread + + _rho, _u = self.macroscopic.neon_functional(_f_post_stream) + _feq = self.equilibrium.neon_functional(_rho, _u) + _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega) + + for l in range(self.velocity_set.q): + wp.neon_write(f_1_pn, index, l, _f_post_collision[l]) + + loader.declare_kernel(device) + + return ll_collide_coarse + + @neon.Container.factory(name="no_254_collide_coarse") + def collide_coarse_no_254( + level: int, + f_0_fd: Any, + f_1_fd: Any, + bc_mask_fd: Any, + missing_mask_fd: Any, + omega: Any, + timestep: int, + ): + num_levels = f_0_fd.get_grid().num_levels + + def ll_collide_coarse(loader: neon.Loader): + loader.set_mres_grid(bc_mask_fd.get_grid(), level) + + if level + 1 < f_0_fd.get_grid().num_levels: + f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up) + f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up) + else: + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) + + # fake loading to enforce sequential step + + bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) + missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) + + _c = self.velocity_set.c + _w = self.velocity_set.w + + @wp.func + def device(index: Any): + _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + """ + The c++ version starts with the following, which I am not sure is right: + if (type(cell, 0) == CellType::bulk ) { + BC type cells should do collide too + """ + if _boundary_id == wp.uint8(255): + return + if _boundary_id == wp.uint8(254): + return + if not wp.neon_has_child(f_0_pn, index): + # Read thread data for populations, these are post streaming + _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index) + _f_post_stream = _f0_thread + + _rho, _u = self.macroscopic.neon_functional(_f_post_stream) + _feq = self.equilibrium.neon_functional(_rho, _u) + _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega) + + # Apply post-collision boundary conditions + _f_post_collision = apply_bc( + index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False + ) + + # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1 + neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn) + + # Accumulate the post-collision populations in f_0 + for l in range(self.velocity_set.q): + push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l])) + if level < num_levels - 1: + val = _f_post_collision[l] + wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val) + # Verified that this is not needed: wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val) + + wp.neon_write(f_1_pn, index, l, _f_post_collision[l]) + else: + for l in range(self.velocity_set.q): + wp.neon_write(f_1_pn, index, l, self.compute_dtype(0)) + # Verified that this is not needed: wp.neon_write(f_0_pn, index, l, self.compute_dtype(0)) + + loader.declare_kernel(device) + + return ll_collide_coarse + @neon.Container.factory(name="stream_coarse_step_ABC") def stream_coarse_step_ABC( level: int, @@ -550,8 +679,8 @@ def cl_stream_coarse(index: Any): accumulated = accumulated * coalescence_factor # wp.neon_write(f_1_pn, index, l, accumulated) _f_post_stream[l] = accumulated - else: - wp.print("ERRRRRRORRRRRRRRRRRRRR") + # else: + # wp.print("ERRRRRRORRRRRRRRRRRRRR") # do non mres post-streaming corrections _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True) @@ -566,8 +695,8 @@ def cl_stream_coarse(index: Any): return ll_stream_coarse - @neon.Container.factory(name="stream_coarse_step_A") - def stream_coarse_step_A( + @neon.Container.factory(name="no_254_stream_coarse_step_ABC") + def stream_coarse_step_ABC_no_254( level: int, f_0_fd: Any, f_1_fd: Any, @@ -576,12 +705,15 @@ def stream_coarse_step_A( omega: Any, timestep: int, ): - num_levels = f_0_fd.get_grid().get_num_levels() + num_levels = f_0_fd.get_grid().num_levels # if level != 0: # # throw an exception # raise Exception("Only the finest level is supported for now") + # module op to define odd of even iteration + # od_or_even = wp.module("odd_or_even", "even") + def ll_stream_coarse(loader: neon.Loader): loader.set_mres_grid(bc_mask_fd.get_grid(), level) @@ -593,9 +725,15 @@ def ll_stream_coarse(loader: neon.Loader): _c = self.velocity_set.c + coalescence_factor_fd = omega + coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd) + @wp.func def cl_stream_coarse(index: Any): _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + + if _boundary_id == wp.uint8(254): + return if _boundary_id == wp.uint8(255): return @@ -610,48 +748,119 @@ def cl_stream_coarse(index: Any): _f_post_collision = _f0_thread _f_post_stream = self.stream.neon_functional(f_0_pn, index) + for l in range(self.velocity_set.q): + if l == lattice_central_index: + # HERE, we skip the center direction + continue + + pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l])) + + has_ngh_at_same_level = wp.bool(False) + accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level) + + # if (!pin.hasChildren(cell, dir)) { + if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction): + # NO finer ngh. in the pull direction (opposite of l) + if not has_ngh_at_same_level: + # NO ngh. at the same level + # COULD we have a ngh. at the courser level? + if wp.neon_has_parent(f_0_pn, index): + # YES halo cell on top of us + has_a_coarser_ngh = wp.bool(False) + exploded_pop = wp.neon_lbm_read_coarser_ngh( + f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh + ) + if has_a_coarser_ngh: + # Full state: + # NO finer ngh. in the pull direction (opposite of l) + # NO ngh. at the same level + # YES ghost cell on top of us + # YES courser ngh. + # -> **Explosion** + # wp.neon_write(f_1_pn, index, l, exploded_pop) + _f_post_stream[l] = exploded_pop + else: + # HERE -> I have a finer ngh. in direction pull (opposite l) + # Then I have to read from the halo on top of my finer ngh. + if has_ngh_at_same_level: + # if l == 10: + # wp.print(accumulated) + # glob = wp.neon_global_idx(f_1_pn, index) + # wp.neon_cuda_info() + # wp.neon_print(glob) + # wp.neon_level(f_1_pn) + # accumulated = _w[l] + # Full State + # YES finer ngh. in the pull direction (opposite of l) + # YES ngh. at the same level + # -> **Coalescence** + coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l) + accumulated = accumulated * coalescence_factor + # wp.neon_write(f_1_pn, index, l, accumulated) + _f_post_stream[l] = accumulated + # else: + # wp.print("ERRRRRRORRRRRRRRRRRRRR") + + # do non mres post-streaming corrections + _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True) + + # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1 + neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn) + for l in range(self.velocity_set.q): wp.neon_write(f_1_pn, index, l, _f_post_stream[l]) - # wp.print("stream_coarse") loader.declare_kernel(cl_stream_coarse) return ll_stream_coarse - @neon.Container.factory(name="stream_coarse_step_B") - def stream_coarse_step_B( + @neon.Container.factory(name="reset_bc_mask_for_no_mr_no_bc_as_254") + def reset_bc_mask_for_no_mr_no_bc_as_254( level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, - omega: Any, - timestep: int, ): + num_levels = f_0_fd.get_grid().num_levels + + # if level != 0: + # # throw an exception + # raise Exception("Only the finest level is supported for now") + + # module op to define odd of even iteration + # od_or_even = wp.module("odd_or_even", "even") + def ll_stream_coarse(loader: neon.Loader): loader.set_mres_grid(bc_mask_fd.get_grid(), level) - coalescence_factor_fd = omega + f_0_pn = loader.get_mres_read_handle(f_0_fd) f_1_pn = loader.get_mres_write_handle(f_1_fd) bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) - coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd) _c = self.velocity_set.c - _w = self.velocity_set.w @wp.func def cl_stream_coarse(index: Any): _boundary_id = wp.neon_read(bc_mask_pn, index, 0) if _boundary_id == wp.uint8(255): return + if _boundary_id != 0: + return are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index) if are_we_a_halo_cell: # HERE: we are a halo cell so we just exit return + # do stream normally + _missing_mask = _missing_mask_vec() + _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index) + _f_post_collision = _f0_thread + _f_post_stream = self.stream.neon_functional(f_0_pn, index) + for l in range(self.velocity_set.q): if l == lattice_central_index: # HERE, we skip the center direction @@ -681,7 +890,8 @@ def cl_stream_coarse(index: Any): # YES ghost cell on top of us # YES courser ngh. # -> **Explosion** - wp.neon_write(f_1_pn, index, l, exploded_pop) + # wp.neon_write(f_1_pn, index, l, exploded_pop) + return else: # HERE -> I have a finer ngh. in direction pull (opposite l) # Then I have to read from the halo on top of my finer ngh. @@ -697,59 +907,49 @@ def cl_stream_coarse(index: Any): # YES finer ngh. in the pull direction (opposite of l) # YES ngh. at the same level # -> **Coalescence** - coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l) - accumulated = accumulated * coalescence_factor - wp.neon_write(f_1_pn, index, l, accumulated) - + return else: wp.print("ERRRRRRORRRRRRRRRRRRRR") + # Only fluid voxels with the following properties can reach this line: + # They are not BC voxels + # They are not on a resolution jump -> they do not do coalescence or explosion + # They are not mr halo cells + wp.neon_write(bc_mask_pn, index, 0, wp.uint8(254)) + loader.declare_kernel(cl_stream_coarse) return ll_stream_coarse - @neon.Container.factory(name="finest_fused_pull") - def finest_fused_pull( + @neon.Container.factory(name="stream_coarse_step_A") + def stream_coarse_step_A( level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any, omega: Any, - timestep: Any, - is_f1_the_explosion_src_field: bool, + timestep: int, ): - if level != 0: - # throw an exception - raise Exception("Only the finest level is supported for now") - grid = f_0_fd.get_grid() - num_levels = grid.num_levels + num_levels = f_0_fd.get_grid().get_num_levels() # if level != 0: # # throw an exception # raise Exception("Only the finest level is supported for now") - # module op to define odd of even iteration - # od_or_even = wp.module("odd_or_even", "even") - - def finest_fused_pull_launcher(loader: neon.Loader): + def ll_stream_coarse(loader: neon.Loader): loader.set_mres_grid(bc_mask_fd.get_grid(), level) - if level + 1 < f_0_fd.get_grid().num_levels: - f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up) - f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up) - else: - f_0_pn = loader.get_mres_read_handle(f_0_fd) - f_1_pn = loader.get_mres_write_handle(f_1_fd) + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) _c = self.velocity_set.c - _w = self.velocity_set.w @wp.func - def finest_fused_pull_kernel(index: Any): + def cl_stream_coarse(index: Any): _boundary_id = wp.neon_read(bc_mask_pn, index, 0) if _boundary_id == wp.uint8(255): return @@ -765,6 +965,324 @@ def finest_fused_pull_kernel(index: Any): _f_post_collision = _f0_thread _f_post_stream = self.stream.neon_functional(f_0_pn, index) + for l in range(self.velocity_set.q): + wp.neon_write(f_1_pn, index, l, _f_post_stream[l]) + # wp.print("stream_coarse") + + loader.declare_kernel(cl_stream_coarse) + + return ll_stream_coarse + + @neon.Container.factory(name="stream_coarse_step_254") + def stream_coarse_step_254(level: int, f_0_fd: Any, f_1_fd: Any, bc_mask_fd: Any, missing_mask_fd: Any): + + def ll_stream_coarse(loader: neon.Loader): + loader.set_mres_grid(bc_mask_fd.get_grid(), level) + + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) + + bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) + missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) + + _c = self.velocity_set.c + + @wp.func + def cl_stream_coarse(index: Any): + _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + if _boundary_id != wp.uint8(254): + return + + are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index) + if are_we_a_halo_cell: + # HERE: we are a halo cell so we just exit + return + + # do stream normally + _missing_mask = _missing_mask_vec() + _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index) + _f_post_collision = _f0_thread + _f_post_stream = self.stream.neon_functional(f_0_pn, index) + + for l in range(self.velocity_set.q): + wp.neon_write(f_1_pn, index, l, _f_post_stream[l]) + # wp.print("stream_coarse") + + loader.declare_kernel(cl_stream_coarse) + + return ll_stream_coarse + + @neon.Container.factory(name="stream_coarse_step_B") + def stream_coarse_step_B( + level: int, + f_0_fd: Any, + f_1_fd: Any, + bc_mask_fd: Any, + missing_mask_fd: Any, + omega: Any, + ): + def ll_stream_coarse(loader: neon.Loader): + loader.set_mres_grid(bc_mask_fd.get_grid(), level) + coalescence_factor_fd = omega + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) + + bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) + missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) + coalescence_factor_pn = loader.get_mres_read_handle(coalescence_factor_fd) + + _c = self.velocity_set.c + _w = self.velocity_set.w + + @wp.func + def cl_stream_coarse(index: Any): + _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + if _boundary_id == wp.uint8(255): + return + + are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index) + if are_we_a_halo_cell: + # HERE: we are a halo cell so we just exit + return + + for l in range(self.velocity_set.q): + if l == lattice_central_index: + # HERE, we skip the center direction + continue + + pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l])) + + has_ngh_at_same_level = wp.bool(False) + accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level) + + # if (!pin.hasChildren(cell, dir)) { + if not wp.neon_has_finer_ngh(f_0_pn, index, pull_direction): + # NO finer ngh. in the pull direction (opposite of l) + if not has_ngh_at_same_level: + # NO ngh. at the same level + # COULD we have a ngh. at the courser level? + if wp.neon_has_parent(f_0_pn, index): + # YES halo cell on top of us + has_a_coarser_ngh = wp.bool(False) + exploded_pop = wp.neon_lbm_read_coarser_ngh( + f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh + ) + if has_a_coarser_ngh: + # Full state: + # NO finer ngh. in the pull direction (opposite of l) + # NO ngh. at the same level + # YES ghost cell on top of us + # YES courser ngh. + # -> **Explosion** + wp.neon_write(f_1_pn, index, l, exploded_pop) + else: + # HERE -> I have a finer ngh. in direction pull (opposite l) + # Then I have to read from the halo on top of my finer ngh. + if has_ngh_at_same_level: + # if l == 10: + # wp.print(accumulated) + # glob = wp.neon_global_idx(f_1_pn, index) + # wp.neon_cuda_info() + # wp.neon_print(glob) + # wp.neon_level(f_1_pn) + # accumulated = _w[l] + # Full State + # YES finer ngh. in the pull direction (opposite of l) + # YES ngh. at the same level + # -> **Coalescence** + coalescence_factor = wp.neon_read(coalescence_factor_pn, index, l) + accumulated = accumulated * coalescence_factor + wp.neon_write(f_1_pn, index, l, accumulated) + # + # else: + # wp.print("ERRRRRRORRRRRRRRRRRRRR") + + loader.declare_kernel(cl_stream_coarse) + + return ll_stream_coarse + + @neon.Container.factory(name="finest_fused_pull") + def finest_fused_pull( + level: int, + f_0_fd: Any, + f_1_fd: Any, + bc_mask_fd: Any, + missing_mask_fd: Any, + omega: Any, + timestep: Any, + is_f1_the_explosion_src_field: bool, + ): + if level != 0: + # throw an exception + raise Exception("Only the finest level is supported for now") + grid = f_0_fd.get_grid() + num_levels = grid.num_levels + + # if level != 0: + # # throw an exception + # raise Exception("Only the finest level is supported for now") + + # module op to define odd of even iteration + # od_or_even = wp.module("odd_or_even", "even") + + def finest_fused_pull_launcher(loader: neon.Loader): + loader.set_mres_grid(bc_mask_fd.get_grid(), level) + + if level + 1 < f_0_fd.get_grid().num_levels: + f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up) + f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up) + else: + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) + + bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) + missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) + + _c = self.velocity_set.c + _w = self.velocity_set.w + + @wp.func + def finest_fused_pull_kernel(index: Any): + _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + if _boundary_id == wp.uint8(255): + return + + are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index) + if are_we_a_halo_cell: + # HERE: we are a halo cell so we just exit + return + + # do stream normally + _missing_mask = _missing_mask_vec() + _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index) + _f_post_collision = _f0_thread + _f_post_stream = self.stream.neon_functional(f_0_pn, index) + + for l in range(self.velocity_set.q): + if l == lattice_central_index: + # HERE, we skip the center direction + continue + + pull_direction = wp.neon_ngh_idx(wp.int8(-_c[0, l]), wp.int8(-_c[1, l]), wp.int8(-_c[2, l])) + + has_ngh_at_same_level = wp.bool(False) + accumulated = wp.neon_read_ngh(f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_ngh_at_same_level) + + # NO finer ngh. in the pull direction (opposite of l) + if not has_ngh_at_same_level: + # NO ngh. at the same level + # COULD we have a ngh. at the courser level? + if wp.neon_has_parent(f_0_pn, index): + # YES halo cell on top of us + has_a_coarser_ngh = wp.bool(False) + if is_f1_the_explosion_src_field: + exploded_pop = wp.neon_lbm_read_coarser_ngh( + f_1_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh + ) + else: + exploded_pop = wp.neon_lbm_read_coarser_ngh( + f_0_pn, index, pull_direction, l, self.compute_dtype(0), has_a_coarser_ngh + ) + if has_a_coarser_ngh: + # Full state: + # NO finer ngh. in the pull direction (opposite of l) + # NO ngh. at the same level + # YES ghost cell on top of us + # YES courser ngh. + # -> **Explosion** + # wp.neon_write(f_1_pn, index, l, exploded_pop) + _f_post_stream[l] = exploded_pop + + # do non mres post-streaming corrections + _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_collision, _f_post_stream, True) + + _rho, _u = self.macroscopic.neon_functional(_f_post_stream) + _feq = self.equilibrium.neon_functional(_rho, _u) + _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega) + + # Apply post-collision boundary conditions + _f_post_collision = apply_bc( + index, timestep, _boundary_id, _missing_mask, f_0_pn, f_1_pn, _f_post_stream, _f_post_collision, False + ) + + # Apply auxiliary recovery for boundary conditions (swapping) before overwriting f_1 + neon_apply_aux_recovery_bc(index, _boundary_id, _missing_mask, f_0_pn, f_1_pn) + + # Accumulate the post-collision populations in f_0 + for l in range(self.velocity_set.q): + push_direction = wp.neon_ngh_idx(wp.int8(_c[0, l]), wp.int8(_c[1, l]), wp.int8(_c[2, l])) + if level < num_levels - 1: + val = _f_post_collision[l] + if is_f1_the_explosion_src_field: + wp.neon_mres_lbm_store_op(f_1_pn, index, l, push_direction, val) + else: + wp.neon_mres_lbm_store_op(f_0_pn, index, l, push_direction, val) + + wp.neon_write(f_1_pn, index, l, _f_post_collision[l]) + + loader.declare_kernel(finest_fused_pull_kernel) + + return finest_fused_pull_launcher + + @neon.Container.factory(name="finest_fused_pull_no_254") + def finest_fused_pull_no_254( + level: int, + f_0_fd: Any, + f_1_fd: Any, + bc_mask_fd: Any, + missing_mask_fd: Any, + omega: Any, + timestep: Any, + is_f1_the_explosion_src_field: bool, + ): + if level != 0: + # throw an exception + raise Exception("Only the finest level is supported for now") + grid = f_0_fd.get_grid() + num_levels = grid.num_levels + + # if level != 0: + # # throw an exception + # raise Exception("Only the finest level is supported for now") + + # module op to define odd of even iteration + # od_or_even = wp.module("odd_or_even", "even") + + def finest_fused_pull_launcher(loader: neon.Loader): + loader.set_mres_grid(bc_mask_fd.get_grid(), level) + + if level + 1 < f_0_fd.get_grid().num_levels: + f_0_pn = loader.get_mres_write_handle(f_0_fd, neon.Loader.Operation.stencil_up) + f_1_pn = loader.get_mres_write_handle(f_1_fd, neon.Loader.Operation.stencil_up) + else: + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) + + bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) + missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) + + _c = self.velocity_set.c + _w = self.velocity_set.w + + @wp.func + def finest_fused_pull_kernel(index: Any): + _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + if _boundary_id == wp.uint8(255): + return + if _boundary_id == wp.uint8(254): + return + + are_we_a_halo_cell = wp.neon_has_child(f_0_pn, index) + if are_we_a_halo_cell: + # HERE: we are a halo cell so we just exit + return + + # do stream normally + _missing_mask = _missing_mask_vec() + _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index) + _f_post_collision = _f0_thread + _f_post_stream = self.stream.neon_functional(f_0_pn, index) + for l in range(self.velocity_set.q): if l == lattice_central_index: # HERE, we skip the center direction @@ -831,6 +1349,60 @@ def finest_fused_pull_kernel(index: Any): return finest_fused_pull_launcher + @neon.Container.factory(name="254_finest_fused_pull") + def finest_fused_pull_254( + level: int, + f_0_fd: Any, + f_1_fd: Any, + bc_mask_fd: Any, + missing_mask_fd: Any, + omega: Any, + ): + if level != 0: + # throw an exception + raise Exception("Only the finest level is supported for now") + grid = f_0_fd.get_grid() + num_levels = grid.num_levels + + if level != 0: + # throw an exception + raise Exception("Only the finest level is supported for now") + + def finest_fused_pull_launcher(loader: neon.Loader): + loader.set_mres_grid(bc_mask_fd.get_grid(), level) + + f_0_pn = loader.get_mres_read_handle(f_0_fd) + f_1_pn = loader.get_mres_write_handle(f_1_fd) + + bc_mask_pn = loader.get_mres_read_handle(bc_mask_fd) + missing_mask_pn = loader.get_mres_read_handle(missing_mask_fd) + + _c = self.velocity_set.c + _w = self.velocity_set.w + + @wp.func + def finest_fused_pull_kernel_254(index: Any): + _boundary_id = wp.neon_read(bc_mask_pn, index, 0) + if _boundary_id != wp.uint8(254): + return + + # do stream normally + _missing_mask = _missing_mask_vec() + _f0_thread, _missing_mask = neon_get_thread_data(f_0_pn, missing_mask_pn, index) + _f_post_collision = _f0_thread + _f_post_stream = self.stream.neon_functional(f_0_pn, index) + + _rho, _u = self.macroscopic.neon_functional(_f_post_stream) + _feq = self.equilibrium.neon_functional(_rho, _u) + _f_post_collision = self.collision.neon_functional(_f_post_stream, _feq, _rho, _u, omega) + + for l in range(self.velocity_set.q): + wp.neon_write(f_1_pn, index, l, _f_post_collision[l]) + + loader.declare_kernel(finest_fused_pull_kernel_254) + + return finest_fused_pull_launcher + @neon.Container.factory(name="stream_coarse_step_C") def stream_coarse_step_C( level: int, @@ -889,29 +1461,89 @@ def cl_stream_coarse(index: Any): "stream_coarse_step_B": stream_coarse_step_B, "stream_coarse_step_C": stream_coarse_step_C, "finest_fused_pull": finest_fused_pull, + "finest_fused_pull_no_254": finest_fused_pull_no_254, + "finest_fused_pull_254": finest_fused_pull_254, + "reset_bc_mask_for_no_mr_no_bc_as_254": reset_bc_mask_for_no_mr_no_bc_as_254, + "collide_coarse_no_254": collide_coarse_no_254, + "collide_coarse_254": collide_coarse_254, + "stream_coarse_step_ABC_no_254": stream_coarse_step_ABC_no_254, + "stream_coarse_step_254": stream_coarse_step_254, } def launch_container(self, streamId, op_name, mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep): self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep).run(0) - def add_to_app( - self, - app, - op_name, - mres_level, - f_0, - f_1, - bc_mask, - missing_mask, - omega, - timestep, - is_f1_the_explosion_src_field: bool = None, - ): + def add_to_app(self, **kwargs): + import inspect + + def validate_kwargs_forward(func, kwargs): + """ + Check whether `func(**kwargs)` would be valid, + and return *all* the issues instead of raising on the first one. + + Returns a dict; empty dict means "everything is OK". + """ + sig = inspect.signature(func) + params = sig.parameters + + errors = {} + + # --- 1. Positional-only required params (cannot be given via kwargs) --- + pos_only_required = [name for name, p in params.items() if p.kind == inspect.Parameter.POSITIONAL_ONLY and p.default is inspect._empty] + if pos_only_required: + errors["positional_only_required"] = pos_only_required + + # --- 2. Unexpected kwargs (if no **kwargs in target) --- + has_var_kw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()) + if not has_var_kw: + allowed_kw = { + name + for name, p in params.items() + if p.kind + in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + } + unexpected = sorted(set(kwargs) - allowed_kw) + if unexpected: + errors["unexpected_kwargs"] = unexpected + + # --- 3. Missing required keyword-passable params --- + missing_required = [ + name + for name, p in params.items() + if p.kind + in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + and p.default is inspect._empty # no default + and name not in kwargs # not provided + ] + if missing_required: + errors["missing_required"] = missing_required + + return errors + + container_generator = None + try: + op_name = kwargs.pop("op_name") + app = kwargs.pop("app") + except: + raise ValueError("op_name and app must be provided as keyword arguments") + + try: + container_generator = self.neon_container[op_name] + except KeyError: + raise ValueError(f"Operator {op_name} not found in neon container. Available operators: {list(self.neon_container.keys())}") + + errors = validate_kwargs_forward(container_generator, kwargs) + if errors: + raise ValueError(f"Cannot forward kwargs to target: {errors}") + nvtx.push_range(f"New Container {op_name}", color="yellow") - if is_f1_the_explosion_src_field is None: - app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep)) - else: - app.append(self.neon_container[op_name](mres_level, f_0, f_1, bc_mask, missing_mask, omega, timestep, is_f1_the_explosion_src_field)) + app.append(container_generator(**kwargs)) nvtx.pop_range() @Operator.register_backend(ComputeBackend.NEON)